From 91e9fa4f8518f21c67917f66bc83fb14bc71c80f Mon Sep 17 00:00:00 2001
From: GNU Libc Maintainers <debian-glibc@lists.debian.org>
Date: Sun, 28 Dec 2025 17:32:33 +0100
Subject: [PATCH] git-updates

GIT update of https://sourceware.org/git/glibc.git/release/2.41/master from glibc-2.41

GIT update of https://sourceware.org/git/glibc.git/release/2.41/master from glibc-2.41


Gbp-Pq: Name git-updates.diff
---
 ADVISORIES                                    |   2 +
 Makeconfig                                    |   2 +-
 NEWS                                          |  33 ++
 advisories/GLIBC-SA-2023-0001                 |  14 -
 advisories/GLIBC-SA-2023-0002                 |  15 -
 advisories/GLIBC-SA-2023-0003                 |  15 -
 advisories/GLIBC-SA-2023-0004                 |  16 -
 advisories/GLIBC-SA-2023-0005                 |  18 -
 advisories/GLIBC-SA-2024-0001                 |  15 -
 advisories/GLIBC-SA-2024-0002                 |  15 -
 advisories/GLIBC-SA-2024-0003                 |  13 -
 advisories/GLIBC-SA-2024-0004                 |  28 --
 advisories/GLIBC-SA-2024-0005                 |  22 -
 advisories/GLIBC-SA-2024-0006                 |  32 --
 advisories/GLIBC-SA-2024-0007                 |  28 --
 advisories/GLIBC-SA-2024-0008                 |  26 --
 advisories/GLIBC-SA-2025-0001                 |  25 --
 advisories/README                             |  77 ----
 assert/Makefile                               |   1 +
 assert/tst-assert-sa-2025-0001.c              |  92 +++++
 benchtests/atanh-inputs                       |   1 +
 benchtests/sinh-inputs                        |   1 +
 config.make.in                                |   1 +
 configure                                     | 162 ++++++++
 configure.ac                                  |  32 ++
 elf/Makefile                                  |  45 +-
 elf/dl-execstack-tunable.c                    |  39 ++
 elf/dl-execstack.c                            |   2 +-
 elf/dl-find_object.c                          |  79 ++--
 elf/dl-find_object.h                          |   4 +-
 elf/dl-load.c                                 |   4 +-
 elf/dl-reloc-static-pie.c                     |   3 +-
 elf/dl-support.c                              |   4 +-
 elf/dl-tls.c                                  |   7 +
 elf/dl-tunables.list                          |   2 +-
 elf/rtld.c                                    |  88 ++--
 elf/tst-audit-tlsdesc-dlopen2.c               |  46 +++
 elf/tst-auditmod-tlsdesc2.c                   |  59 +++
 elf/tst-dlopen-sgid-mod.c                     |   1 +
 elf/tst-dlopen-sgid.c                         | 106 +++++
 elf/tst-env-setuid-tunables.c                 |  18 +-
 elf/tst-env-setuid.c                          |  17 +-
 elf/tst-execstack-prog-static-tunable.c       |   1 +
 elf/tst-execstack-tunable.c                   |   1 +
 elf/tst-link-map-contiguous-ldso.c            |  98 +++++
 elf/tst-link-map-contiguous-libc.c            |  57 +++
 elf/tst-link-map-contiguous-main.c            |  45 ++
 elf/tst-pie-bss-static.c                      |  19 +
 .../strcmp-power10.S => elf/tst-pie-bss.c     |  20 +-
 elf/tst-rtld-list-tunables.exp                |   2 +-
 iconv/iconv_prog.c                            |   4 +-
 iconv/tst-iconv_prog-buffer.sh                |   4 +
 include/dlfcn.h                               |   3 +-
 math/auto-libm-test-in                        |   4 +
 math/auto-libm-test-out-log10p1               |  25 ++
 math/auto-libm-test-out-sinh                  |  25 ++
 math/auto-libm-test-out-tan                   |  25 ++
 math/bits/mathcalls-macros.h                  |   2 +-
 nptl/Makefile                                 |   3 +
 nptl/cancellation.c                           |   4 +-
 nptl/pthread_cancel.c                         |  14 +-
 nptl/pthread_getattr_np.c                     |   4 +-
 posix/Makefile                                |   1 +
 posix/environ.c                               |   4 +
 posix/regcomp.c                               |   4 +-
 posix/tst-regcomp-bracket-free.c              | 176 ++++++++
 stdlib/Makefile                               |   2 +
 stdlib/abort.c                                |   6 +-
 stdlib/getenv.c                               |   3 -
 stdlib/tst-getenv-static.c                    |  38 ++
 stdlib/tst-secure-getenv.c                    |   9 +-
 support/capture_subprocess.h                  |  11 +-
 support/support_capture_subprocess.c          | 166 ++++----
 sysdeps/aarch64/fpu/acos_advsimd.c            |  56 ++-
 sysdeps/aarch64/fpu/acos_sve.c                |  75 ++--
 sysdeps/aarch64/fpu/acosh_sve.c               |   6 +-
 sysdeps/aarch64/fpu/asin_advsimd.c            |  46 ++-
 sysdeps/aarch64/fpu/asin_sve.c                |  74 ++--
 sysdeps/aarch64/fpu/asinf_advsimd.c           |  31 +-
 sysdeps/aarch64/fpu/asinh_sve.c               | 111 +++--
 sysdeps/aarch64/fpu/atan2_advsimd.c           | 128 +++---
 sysdeps/aarch64/fpu/atan2_sve.c               | 107 +++--
 sysdeps/aarch64/fpu/atan2f_advsimd.c          |  64 +--
 sysdeps/aarch64/fpu/atan2f_sve.c              |  61 +--
 sysdeps/aarch64/fpu/atan_advsimd.c            |  83 ++--
 sysdeps/aarch64/fpu/atan_sve.c                | 104 +++--
 sysdeps/aarch64/fpu/atanf_advsimd.c           |  97 +++--
 sysdeps/aarch64/fpu/atanf_sve.c               |  84 ++--
 sysdeps/aarch64/fpu/atanh_sve.c               |   3 +-
 sysdeps/aarch64/fpu/cosh_sve.c                | 135 +++---
 sysdeps/aarch64/fpu/coshf_sve.c               |   6 +-
 sysdeps/aarch64/fpu/erfcf_sve.c               |  12 +-
 sysdeps/aarch64/fpu/exp10_sve.c               |  25 +-
 sysdeps/aarch64/fpu/exp10f_sve.c              |  53 +--
 sysdeps/aarch64/fpu/exp2_sve.c                |  76 ++--
 sysdeps/aarch64/fpu/exp2f_sve.c               |  35 +-
 sysdeps/aarch64/fpu/exp_sve.c                 |  36 +-
 sysdeps/aarch64/fpu/expf_sve.c                |   6 +-
 sysdeps/aarch64/fpu/expm1_sve.c               | 202 ++++++---
 sysdeps/aarch64/fpu/log1p_sve.c               |  84 +++-
 sysdeps/aarch64/fpu/pow_sve.c                 | 245 ++++++-----
 sysdeps/aarch64/fpu/powf_sve.c                | 117 +++---
 sysdeps/aarch64/fpu/sinh_sve.c                | 165 +++++---
 sysdeps/aarch64/fpu/sv_expf_inline.h          |  31 +-
 sysdeps/aarch64/fpu/sv_log1p_inline.h         |  86 +++-
 sysdeps/aarch64/fpu/tanh_sve.c                | 154 ++++---
 sysdeps/aarch64/multiarch/Makefile            |   1 +
 sysdeps/aarch64/multiarch/ifunc-impl-list.c   |   1 +
 sysdeps/aarch64/multiarch/memset.c            |   4 +
 sysdeps/aarch64/multiarch/memset_sve_zva64.S  | 123 ++++++
 sysdeps/arm/find_exidx.c                      |   3 +-
 sysdeps/generic/ldsodefs.h                    |  15 +-
 sysdeps/ieee754/dbl-64/e_atanh.c              |   8 +
 sysdeps/ieee754/dbl-64/e_sinh.c               |   8 +
 sysdeps/ieee754/dbl-64/math_config.h          |   6 +-
 sysdeps/ieee754/dbl-64/s_fma.c                |   3 +
 sysdeps/ieee754/dbl-64/s_tanh.c               |   5 +
 sysdeps/ieee754/flt-32/e_sinhf.c              |   2 +-
 sysdeps/ieee754/flt-32/s_log10p1f.c           |   2 +-
 sysdeps/ieee754/flt-32/s_tanf.c               |   2 +-
 sysdeps/mach/hurd/dl-execstack.c              |   5 +-
 sysdeps/nptl/bits/thread-shared-types.h       |   2 +
 sysdeps/nptl/dl-tls_init_tp.c                 |   1 +
 sysdeps/nptl/pthread.h                        |   2 +-
 sysdeps/powerpc/powerpc64/le/power10/memchr.S | 315 --------------
 sysdeps/powerpc/powerpc64/le/power10/strcmp.S | 233 -----------
 .../powerpc/powerpc64/le/power10/strncmp.S    | 271 ------------
 sysdeps/powerpc/powerpc64/multiarch/Makefile  |  11 +-
 .../powerpc64/multiarch/ifunc-impl-list.c     |  13 -
 sysdeps/powerpc/powerpc64/multiarch/memchr.c  |  20 +-
 sysdeps/powerpc/powerpc64/multiarch/strcmp.c  |   4 -
 sysdeps/powerpc/powerpc64/multiarch/strncmp.c |   4 -
 sysdeps/pthread/Makefile                      |  11 +
 sysdeps/pthread/tst-cancel32.c                |  73 ++++
 sysdeps/pthread/tst-stack2-mod.c              |  39 ++
 sysdeps/pthread/tst-stack2.c                  |  47 +++
 sysdeps/riscv/dl-machine.h                    |  17 +-
 sysdeps/sparc/sparc32/start.S                 |  11 +-
 sysdeps/sparc/sparc64/start.S                 |   4 +
 sysdeps/unix/sysv/linux/aarch64/Makefile      | 129 +++++-
 .../unix/sysv/linux/aarch64/cpu-features.c    |   1 +
 .../sysv/linux/aarch64/tst-aarch64-pkey.c     |   4 +
 .../unix/sysv/linux/aarch64/tst-gcs-abort.sh  |  39 ++
 .../linux/aarch64/tst-gcs-disabled-static.c   |   1 +
 .../sysv/linux/aarch64/tst-gcs-disabled.c     |   2 +
 .../linux/aarch64/tst-gcs-dlopen-disabled.c   |   3 +
 .../linux/aarch64/tst-gcs-dlopen-enforced.c   |   3 +
 .../aarch64/tst-gcs-dlopen-optional-off.c     |   3 +
 .../aarch64/tst-gcs-dlopen-optional-on.c      |   3 +
 .../linux/aarch64/tst-gcs-dlopen-override.c   |   3 +
 .../unix/sysv/linux/aarch64/tst-gcs-dlopen.c  |  62 +++
 .../linux/aarch64/tst-gcs-enforced-abort.c    |   2 +
 .../aarch64/tst-gcs-enforced-static-abort.c   |   1 +
 .../linux/aarch64/tst-gcs-enforced-static.c   |   1 +
 .../sysv/linux/aarch64/tst-gcs-enforced.c     |   2 +
 .../unix/sysv/linux/aarch64/tst-gcs-helper.h  |  39 ++
 .../sysv/linux/aarch64/tst-gcs-mod1.c}        |  16 +-
 .../sysv/linux/aarch64/tst-gcs-mod2.c}        |  19 +-
 .../unix/sysv/linux/aarch64/tst-gcs-mod3.c    |  25 ++
 .../sysv/linux/aarch64/tst-gcs-noreturn.c     | 101 +++++
 .../sysv/linux/aarch64/tst-gcs-optional-off.c |   2 +
 .../sysv/linux/aarch64/tst-gcs-optional-on.c  |   2 +
 .../aarch64/tst-gcs-optional-static-off.c     |   1 +
 .../aarch64/tst-gcs-optional-static-on.c      |   1 +
 .../linux/aarch64/tst-gcs-override-static.c   |   1 +
 .../sysv/linux/aarch64/tst-gcs-override.c     |   2 +
 .../linux/aarch64/tst-gcs-shared-disabled.c   |   2 +
 .../aarch64/tst-gcs-shared-enforced-abort.c   |   2 +
 .../linux/aarch64/tst-gcs-shared-optional.c   |   2 +
 .../linux/aarch64/tst-gcs-shared-override.c   |   2 +
 .../unix/sysv/linux/aarch64/tst-gcs-shared.c  |  41 ++
 .../sysv/linux/aarch64/tst-gcs-skeleton.c     |  43 ++
 sysdeps/unix/sysv/linux/bits/sched.h          |   2 +-
 sysdeps/unix/sysv/linux/dl-execstack.c        |   7 +-
 sysdeps/unix/sysv/linux/rseq-internal.h       |  11 +-
 sysdeps/x86/Makefile                          |  22 +
 sysdeps/x86/bits/floatn.h                     |  10 +-
 sysdeps/x86/cpu-features.c                    | 389 +++++++++---------
 sysdeps/x86/cpu-tunables.c                    |   2 +
 sysdeps/x86/dl-diagnostics-cpu.c              |   2 +
 sysdeps/x86/include/cpu-features.h            |   9 +-
 sysdeps/x86/sysdep.h                          |   6 +
 sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c       |   1 +
 sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c      |   1 +
 sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c |   1 +
 sysdeps/x86_64/Makefile                       |   1 -
 sysdeps/x86_64/dl-tlsdesc-dynamic.h           |   2 +-
 sysdeps/x86_64/fpu/multiarch/Makefile         |   6 +
 sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c    |   6 +
 sysdeps/x86_64/fpu/multiarch/e_atanh.c        |  34 ++
 sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c     |  12 +
 sysdeps/x86_64/fpu/multiarch/e_sinh.c         |  35 ++
 sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c     |  11 +
 sysdeps/x86_64/fpu/multiarch/s_tanh.c         |  31 ++
 sysdeps/x86_64/multiarch/ifunc-impl-list.c    |   4 +-
 sysdeps/x86_64/tst-auditmod10b.c              | 109 ++---
 196 files changed, 4334 insertions(+), 2794 deletions(-)
 create mode 100644 ADVISORIES
 delete mode 100644 advisories/GLIBC-SA-2023-0001
 delete mode 100644 advisories/GLIBC-SA-2023-0002
 delete mode 100644 advisories/GLIBC-SA-2023-0003
 delete mode 100644 advisories/GLIBC-SA-2023-0004
 delete mode 100644 advisories/GLIBC-SA-2023-0005
 delete mode 100644 advisories/GLIBC-SA-2024-0001
 delete mode 100644 advisories/GLIBC-SA-2024-0002
 delete mode 100644 advisories/GLIBC-SA-2024-0003
 delete mode 100644 advisories/GLIBC-SA-2024-0004
 delete mode 100644 advisories/GLIBC-SA-2024-0005
 delete mode 100644 advisories/GLIBC-SA-2024-0006
 delete mode 100644 advisories/GLIBC-SA-2024-0007
 delete mode 100644 advisories/GLIBC-SA-2024-0008
 delete mode 100644 advisories/GLIBC-SA-2025-0001
 delete mode 100644 advisories/README
 create mode 100644 assert/tst-assert-sa-2025-0001.c
 create mode 100644 elf/dl-execstack-tunable.c
 create mode 100644 elf/tst-audit-tlsdesc-dlopen2.c
 create mode 100644 elf/tst-auditmod-tlsdesc2.c
 create mode 100644 elf/tst-dlopen-sgid-mod.c
 create mode 100644 elf/tst-dlopen-sgid.c
 create mode 100644 elf/tst-execstack-prog-static-tunable.c
 create mode 100644 elf/tst-execstack-tunable.c
 create mode 100644 elf/tst-link-map-contiguous-ldso.c
 create mode 100644 elf/tst-link-map-contiguous-libc.c
 create mode 100644 elf/tst-link-map-contiguous-main.c
 create mode 100644 elf/tst-pie-bss-static.c
 rename sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S => elf/tst-pie-bss.c (66%)
 create mode 100644 posix/tst-regcomp-bracket-free.c
 create mode 100644 stdlib/tst-getenv-static.c
 create mode 100644 sysdeps/aarch64/multiarch/memset_sve_zva64.S
 delete mode 100644 sysdeps/powerpc/powerpc64/le/power10/memchr.S
 delete mode 100644 sysdeps/powerpc/powerpc64/le/power10/strcmp.S
 delete mode 100644 sysdeps/powerpc/powerpc64/le/power10/strncmp.S
 create mode 100644 sysdeps/pthread/tst-cancel32.c
 create mode 100644 sysdeps/pthread/tst-stack2-mod.c
 create mode 100644 sysdeps/pthread/tst-stack2.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-abort.sh
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-disabled-static.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-disabled.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-disabled.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-enforced.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-optional-off.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-optional-on.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-override.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced-abort.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced-static-abort.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced-static.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-helper.h
 rename sysdeps/{powerpc/powerpc64/multiarch/strncmp-power10.S => unix/sysv/linux/aarch64/tst-gcs-mod1.c} (72%)
 rename sysdeps/{powerpc/powerpc64/multiarch/memchr-power10.S => unix/sysv/linux/aarch64/tst-gcs-mod2.c} (66%)
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-mod3.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-noreturn.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-off.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-on.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-static-off.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-static-on.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-override-static.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-override.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-disabled.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-enforced-abort.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-optional.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-override.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared.c
 create mode 100644 sysdeps/unix/sysv/linux/aarch64/tst-gcs-skeleton.c
 create mode 100644 sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c
 create mode 100644 sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c
 create mode 100644 sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c
 create mode 100644 sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c
 create mode 100644 sysdeps/x86_64/fpu/multiarch/e_atanh.c
 create mode 100644 sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c
 create mode 100644 sysdeps/x86_64/fpu/multiarch/e_sinh.c
 create mode 100644 sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c
 create mode 100644 sysdeps/x86_64/fpu/multiarch/s_tanh.c

diff --git a/ADVISORIES b/ADVISORIES
new file mode 100644
index 000000000..d4e33f2df
--- /dev/null
+++ b/ADVISORIES
@@ -0,0 +1,2 @@
+For the GNU C Library Security Advisories, see the git master branch:
+https://sourceware.org/git/?p=glibc.git;a=tree;f=advisories;hb=HEAD
diff --git a/Makeconfig b/Makeconfig
index d0108d2ca..aa547a443 100644
--- a/Makeconfig
+++ b/Makeconfig
@@ -633,7 +633,7 @@ link-libc-printers-tests = $(link-libc-rpath) \
 			   $(link-libc-tests-after-rpath-link)
 
 # This is how to find at build-time things that will be installed there.
-rpath-dirs = math elf dlfcn nss nis rt resolv mathvec support
+rpath-dirs = math elf dlfcn nss nis rt resolv mathvec support misc
 rpath-link = \
 $(common-objdir):$(subst $(empty) ,:,$(patsubst ../$(subdir),.,$(rpath-dirs:%=$(common-objpfx)%)))
 else  # build-static
diff --git a/NEWS b/NEWS
index b11422b06..f77d1471c 100644
--- a/NEWS
+++ b/NEWS
@@ -5,6 +5,39 @@ See the end for copying conditions.
 Please send GNU C library bug reports via <https://sourceware.org/bugzilla/>
 using `glibc' in the "product" field.
 
+Version 2.41.1
+
+Deprecated and removed features, and other changes affecting compatibility:
+
+* The glibc.rtld.execstack tunable now supports a compatibility mode to
+  allow programs that require an executable stack through dynamically
+  loaded shared libraries.
+
+The following bugs were resolved with this release:
+
+  [31943] _dl_find_object can fail if ld.so contains gaps between load segments
+  [32269] RISC-V IFUNC resolver cannot access gp pointer
+  [32626] math: log10p1f is not correctly rounded
+  [32627] math: sinhf is not correctly rounded
+  [32630] math: tanf is not correctly rounded for all rounding
+    modes
+  [32653] dynamic-link: Review options for improving both security and
+    backwards compatibility of glibc 2.41 dlopen / execstack handling
+  [32781] Linux: Remove attribute access from sched_getattr
+  [32782] nptl: Race conditions in pthread cancellation causing crash
+  [32786] nptl: PTHREAD_COND_INITIALIZER compatibility with pre-2.41 versions
+  [32810] Crash on x86-64 if XSAVEC is disabled via tunable
+  [32882] tst-audit10 fails with SIGILL on CPUs without AVX
+  [32897] dynamic-link: pthread_getattr_np fails when executable stack
+    tunable is set
+  [32981] ports: elf/tst-execstack-prog-static-tunable fails on
+    sparc64-linux-gnu
+  [32987] elf: Fix subprocess status handling for tst-dlopen-sgid
+  [32994] stdlib: resolve a double lock init issue after fork
+  [33164] iconv -o should not create executable files
+  [33185] Fix double-free after allocation failure in regcomp
+  [33245] nptl: error in internal cancellation syscall handling
+
 Version 2.41
 
 Major new features:
diff --git a/advisories/GLIBC-SA-2023-0001 b/advisories/GLIBC-SA-2023-0001
deleted file mode 100644
index 3d19c91b6..000000000
--- a/advisories/GLIBC-SA-2023-0001
+++ /dev/null
@@ -1,14 +0,0 @@
-printf: incorrect output for integers with thousands separator and width field
-
-When the printf family of functions is called with a format specifier
-that uses an <apostrophe> (enable grouping) and a minimum width
-specifier, the resulting output could be larger than reasonably expected
-by a caller that computed a tight bound on the buffer size.  The
-resulting larger than expected output could result in a buffer overflow
-in the printf family of functions.
-
-CVE-Id: CVE-2023-25139
-Public-Date: 2023-02-02
-Vulnerable-Commit: e88b9f0e5cc50cab57a299dc7efe1a4eb385161d (2.37)
-Fix-Commit: c980549cc6a1c03c23cc2fe3e7b0fe626a0364b0 (2.38)
-Fix-Commit: 07b9521fc6369d000216b96562ff7c0ed32a16c4 (2.37-4)
diff --git a/advisories/GLIBC-SA-2023-0002 b/advisories/GLIBC-SA-2023-0002
deleted file mode 100644
index 5122669a6..000000000
--- a/advisories/GLIBC-SA-2023-0002
+++ /dev/null
@@ -1,15 +0,0 @@
-getaddrinfo: Stack read overflow in no-aaaa mode
-
-If the system is configured in no-aaaa mode via /etc/resolv.conf,
-getaddrinfo is called for the AF_UNSPEC address family, and a DNS
-response is received over TCP that is larger than 2048 bytes,
-getaddrinfo may potentially disclose stack contents via the returned
-address data, or crash.
-
-CVE-Id: CVE-2023-4527
-Public-Date: 2023-09-12
-Vulnerable-Commit: f282cdbe7f436c75864e5640a409a10485e9abb2 (2.36)
-Fix-Commit: bd77dd7e73e3530203be1c52c8a29d08270cb25d (2.39)
-Fix-Commit: 4ea972b7edd7e36610e8cde18bf7a8149d7bac4f (2.36-113)
-Fix-Commit: b7529346025a130fee483d42178b5c118da971bb (2.37-38)
-Fix-Commit: b25508dd774b617f99419bdc3cf2ace4560cd2d6 (2.38-19)
diff --git a/advisories/GLIBC-SA-2023-0003 b/advisories/GLIBC-SA-2023-0003
deleted file mode 100644
index d3aef8034..000000000
--- a/advisories/GLIBC-SA-2023-0003
+++ /dev/null
@@ -1,15 +0,0 @@
-getaddrinfo: Potential use-after-free
-
-When an NSS plugin only implements the _gethostbyname2_r and
-_getcanonname_r callbacks, getaddrinfo could use memory that was freed
-during buffer resizing, potentially causing a crash or read or write to
-arbitrary memory.
-
-CVE-Id: CVE-2023-4806
-Public-Date: 2023-09-12
-Fix-Commit: 973fe93a5675c42798b2161c6f29c01b0e243994 (2.39)
-Fix-Commit: e09ee267c03e3150c2c9ba28625ab130705a485e (2.34-420)
-Fix-Commit: e3ccb230a961b4797510e6a1f5f21fd9021853e7 (2.35-270)
-Fix-Commit: a9728f798ec7f05454c95637ee6581afaa9b487d (2.36-115)
-Fix-Commit: 6529a7466c935f36e9006b854d6f4e1d4876f942 (2.37-39)
-Fix-Commit: 00ae4f10b504bc4564e9f22f00907093f1ab9338 (2.38-20)
diff --git a/advisories/GLIBC-SA-2023-0004 b/advisories/GLIBC-SA-2023-0004
deleted file mode 100644
index 5286a7aa5..000000000
--- a/advisories/GLIBC-SA-2023-0004
+++ /dev/null
@@ -1,16 +0,0 @@
-tunables: local privilege escalation through buffer overflow
-
-If a tunable of the form NAME=NAME=VAL is passed in the environment of a
-setuid program and NAME is valid, it may result in a buffer overflow,
-which could be exploited to achieve escalated privileges.  This flaw was
-introduced in glibc 2.34.
-
-CVE-Id: CVE-2023-4911
-Public-Date: 2023-10-03
-Vulnerable-Commit: 2ed18c5b534d9e92fc006202a5af0df6b72e7aca (2.34)
-Fix-Commit: 1056e5b4c3f2d90ed2b4a55f96add28da2f4c8fa (2.39)
-Fix-Commit: dcc367f148bc92e7f3778a125f7a416b093964d9 (2.34-423)
-Fix-Commit: c84018a05aec80f5ee6f682db0da1130b0196aef (2.35-274)
-Fix-Commit: 22955ad85186ee05834e47e665056148ca07699c (2.36-118)
-Fix-Commit: b4e23c75aea756b4bddc4abcf27a1c6dca8b6bd3 (2.37-45)
-Fix-Commit: 750a45a783906a19591fb8ff6b7841470f1f5701 (2.38-27)
diff --git a/advisories/GLIBC-SA-2023-0005 b/advisories/GLIBC-SA-2023-0005
deleted file mode 100644
index cc4eb90b8..000000000
--- a/advisories/GLIBC-SA-2023-0005
+++ /dev/null
@@ -1,18 +0,0 @@
-getaddrinfo: DoS due to memory leak
-
-The fix for CVE-2023-4806 introduced a memory leak when an application
-calls getaddrinfo for AF_INET6 with AI_CANONNAME, AI_ALL and AI_V4MAPPED
-flags set.
-
-CVE-Id: CVE-2023-5156
-Public-Date: 2023-09-25
-Vulnerable-Commit: e09ee267c03e3150c2c9ba28625ab130705a485e (2.34-420)
-Vulnerable-Commit: e3ccb230a961b4797510e6a1f5f21fd9021853e7 (2.35-270)
-Vulnerable-Commit: a9728f798ec7f05454c95637ee6581afaa9b487d (2.36-115)
-Vulnerable-Commit: 6529a7466c935f36e9006b854d6f4e1d4876f942 (2.37-39)
-Vulnerable-Commit: 00ae4f10b504bc4564e9f22f00907093f1ab9338 (2.38-20)
-Fix-Commit: 8006457ab7e1cd556b919f477348a96fe88f2e49 (2.34-421)
-Fix-Commit: 17092c0311f954e6f3c010f73ce3a78c24ac279a (2.35-272)
-Fix-Commit: 856bac55f98dc840e7c27cfa82262b933385de90 (2.36-116)
-Fix-Commit: 4473d1b87d04b25cdd0e0354814eeaa421328268 (2.37-42)
-Fix-Commit: 5ee59ca371b99984232d7584fe2b1a758b4421d3 (2.38-24)
diff --git a/advisories/GLIBC-SA-2024-0001 b/advisories/GLIBC-SA-2024-0001
deleted file mode 100644
index 28931c75a..000000000
--- a/advisories/GLIBC-SA-2024-0001
+++ /dev/null
@@ -1,15 +0,0 @@
-syslog: Heap buffer overflow in __vsyslog_internal
-
-__vsyslog_internal did not handle a case where printing a SYSLOG_HEADER
-containing a long program name failed to update the required buffer
-size, leading to the allocation and overflow of a too-small buffer on
-the heap.
-
-CVE-Id: CVE-2023-6246
-Public-Date: 2024-01-30
-Vulnerable-Commit: 52a5be0df411ef3ff45c10c7c308cb92993d15b1 (2.37)
-Fix-Commit: 6bd0e4efcc78f3c0115e5ea9739a1642807450da (2.39)
-Fix-Commit: 23514c72b780f3da097ecf33a793b7ba9c2070d2 (2.38-42)
-Fix-Commit: 97a4292aa4a2642e251472b878d0ec4c46a0e59a (2.37-57)
-Vulnerable-Commit: b0e7888d1fa2dbd2d9e1645ec8c796abf78880b9 (2.36-16)
-Fix-Commit: d1a83b6767f68b3cb5b4b4ea2617254acd040c82 (2.36-126)
diff --git a/advisories/GLIBC-SA-2024-0002 b/advisories/GLIBC-SA-2024-0002
deleted file mode 100644
index 940bfcf2f..000000000
--- a/advisories/GLIBC-SA-2024-0002
+++ /dev/null
@@ -1,15 +0,0 @@
-syslog: Heap buffer overflow in __vsyslog_internal
-
-__vsyslog_internal used the return value of snprintf/vsnprintf to
-calculate buffer sizes for memory allocation.  If these functions (for
-any reason) failed and returned -1, the resulting buffer would be too
-small to hold output.
-
-CVE-Id: CVE-2023-6779
-Public-Date: 2024-01-30
-Vulnerable-Commit: 52a5be0df411ef3ff45c10c7c308cb92993d15b1 (2.37)
-Fix-Commit: 7e5a0c286da33159d47d0122007aac016f3e02cd (2.39)
-Fix-Commit: d0338312aace5bbfef85e03055e1212dd0e49578 (2.38-43)
-Fix-Commit: 67062eccd9a65d7fda9976a56aeaaf6c25a80214 (2.37-58)
-Vulnerable-Commit: b0e7888d1fa2dbd2d9e1645ec8c796abf78880b9 (2.36-16)
-Fix-Commit: 2bc9d7c002bdac38b5c2a3f11b78e309d7765b83 (2.36-127)
diff --git a/advisories/GLIBC-SA-2024-0003 b/advisories/GLIBC-SA-2024-0003
deleted file mode 100644
index b43a5150a..000000000
--- a/advisories/GLIBC-SA-2024-0003
+++ /dev/null
@@ -1,13 +0,0 @@
-syslog: Integer overflow in __vsyslog_internal
-
-__vsyslog_internal calculated a buffer size by adding two integers, but
-did not first check if the addition would overflow.
-
-CVE-Id: CVE-2023-6780
-Public-Date: 2024-01-30
-Vulnerable-Commit: 52a5be0df411ef3ff45c10c7c308cb92993d15b1 (2.37)
-Fix-Commit: ddf542da94caf97ff43cc2875c88749880b7259b (2.39)
-Fix-Commit: d37c2b20a4787463d192b32041c3406c2bd91de0 (2.38-44)
-Fix-Commit: 2b58cba076e912961ceaa5fa58588e4b10f791c0 (2.37-59)
-Vulnerable-Commit: b0e7888d1fa2dbd2d9e1645ec8c796abf78880b9 (2.36-16)
-Fix-Commit: b9b7d6a27aa0632f334352fa400771115b3c69b7 (2.36-128)
diff --git a/advisories/GLIBC-SA-2024-0004 b/advisories/GLIBC-SA-2024-0004
deleted file mode 100644
index 08df2b311..000000000
--- a/advisories/GLIBC-SA-2024-0004
+++ /dev/null
@@ -1,28 +0,0 @@
-ISO-2022-CN-EXT: fix out-of-bound writes when writing escape sequence
-
-The iconv() function in the GNU C Library versions 2.39 and older may 
-overflow the output buffer passed to it by up to 4 bytes when converting 
-strings to the ISO-2022-CN-EXT character set, which may be used to 
-crash an application or overwrite a neighbouring variable.
-
-ISO-2022-CN-EXT uses escape sequences to indicate character set changes
-(as specified by RFC 1922).  While the SOdesignation has the expected
-bounds checks, neither SS2designation nor SS3designation have its;
-allowing a write overflow of 1, 2, or 3 bytes with fixed values:
-'$+I', '$+J', '$+K', '$+L', '$+M', or '$*H'.
-
-CVE-Id: CVE-2024-2961
-Public-Date: 2024-04-17
-Vulnerable-Commit: 755104edc75c53f4a0e7440334e944ad3c6b32fc (2.1.93-169)
-Fix-Commit: f9dc609e06b1136bb0408be9605ce7973a767ada (2.40)
-Fix-Commit: 31da30f23cddd36db29d5b6a1c7619361b271fb4 (2.39-31)
-Fix-Commit: e1135387deded5d73924f6ca20c72a35dc8e1bda (2.38-66)
-Fix-Commit: 89ce64b269a897a7780e4c73a7412016381c6ecf (2.37-89)
-Fix-Commit: 4ed98540a7fd19f458287e783ae59c41e64df7b5 (2.36-164)
-Fix-Commit: 36280d1ce5e245aabefb877fe4d3c6cff95dabfa (2.35-315)
-Fix-Commit: a8b0561db4b9847ebfbfec20075697d5492a363c (2.34-459)
-Fix-Commit: ed4f16ff6bed3037266f1fa682ebd32a18fce29c (2.33-263)
-Fix-Commit: 682ad4c8623e611a971839990ceef00346289cc9 (2.32-140)
-Fix-Commit: 3703c32a8d304c1ee12126134ce69be965f38000 (2.31-154)
-
-Reported-By: Charles Fol
diff --git a/advisories/GLIBC-SA-2024-0005 b/advisories/GLIBC-SA-2024-0005
deleted file mode 100644
index a59596610..000000000
--- a/advisories/GLIBC-SA-2024-0005
+++ /dev/null
@@ -1,22 +0,0 @@
-nscd: Stack-based buffer overflow in netgroup cache
-
-If the Name Service Cache Daemon's (nscd) fixed size cache is exhausted
-by client requests then a subsequent client request for netgroup data
-may result in a stack-based buffer overflow.  This flaw was introduced
-in glibc 2.15 when the cache was added to nscd.
-
-This vulnerability is only present in the nscd binary.
-
-CVE-Id: CVE-2024-33599
-Public-Date: 2024-04-23
-Vulnerable-Commit: 684ae515993269277448150a1ca70db3b94aa5bd (2.15)
-Fix-Commit: 69c58d5ef9f584ea198bd00f7964d364d0e6b921 (2.31-155)
-Fix-Commit: a77064893bfe8a701770e2f53a4d33805bc47a5a (2.32-141)
-Fix-Commit: 5c75001a96abcd50cbdb74df24c3f013188d076e (2.33-264)
-Fix-Commit: 52f73e5c4e29b14e79167272297977f360ae1e97 (2.34-460)
-Fix-Commit: 7a95873543ce225376faf13bb71c43dea6d24f86 (2.35-316)
-Fix-Commit: caa3151ca460bdd9330adeedd68c3112d97bffe4 (2.36-165)
-Fix-Commit: f75c298e747b2b8b41b1c2f551c011a52c41bfd1 (2.37-91)
-Fix-Commit: 5968aebb86164034b8f8421b4abab2f837a5bdaf (2.38-72)
-Fix-Commit: 1263d583d2e28afb8be53f8d6922f0842036f35d (2.39-35)
-Fix-Commit: 87801a8fd06db1d654eea3e4f7626ff476a9bdaa (2.40)
diff --git a/advisories/GLIBC-SA-2024-0006 b/advisories/GLIBC-SA-2024-0006
deleted file mode 100644
index d44148d3d..000000000
--- a/advisories/GLIBC-SA-2024-0006
+++ /dev/null
@@ -1,32 +0,0 @@
-nscd: Null pointer crash after notfound response
-
-If the Name Service Cache Daemon's (nscd) cache fails to add a not-found
-netgroup response to the cache, the client request can result in a null
-pointer dereference.  This flaw was introduced in glibc 2.15 when the
-cache was added to nscd.
-
-This vulnerability is only present in the nscd binary.
-
-CVE-Id: CVE-2024-33600
-Public-Date: 2024-04-24
-Vulnerable-Commit: 684ae515993269277448150a1ca70db3b94aa5bd (2.15)
-Fix-Commit: b048a482f088e53144d26a61c390bed0210f49f2 (2.40)
-Fix-Commit: 7835b00dbce53c3c87bbbb1754a95fb5e58187aa (2.40)
-Fix-Commit: c99f886de54446cd4447db6b44be93dabbdc2f8b (2.39-37)
-Fix-Commit: 5a508e0b508c8ad53bd0d2fb48fd71b242626341 (2.39-36)
-Fix-Commit: 2ae9446c1b7a3064743b4a51c0bbae668ee43e4c (2.38-74)
-Fix-Commit: 541ea5172aa658c4bd5c6c6d6fd13903c3d5bb0a (2.38-73)
-Fix-Commit: a8070b31043c7585c36ba68a74298c4f7af075c3 (2.37-93)
-Fix-Commit: 5eea50c4402e39588de98aa1d4469a79774703d4 (2.37-92)
-Fix-Commit: f205b3af56740e3b014915b1bd3b162afe3407ef (2.36-167)
-Fix-Commit: c34f470a615b136170abd16142da5dd0c024f7d1 (2.36-166)
-Fix-Commit: bafadc589fbe21ae330e8c2af74db9da44a17660 (2.35-318)
-Fix-Commit: 4370bef52b0f3f3652c6aa13d7a9bb3ac079746d (2.35-317)
-Fix-Commit: 1f94122289a9bf7dba573f5d60327aaa2b85cf2e (2.34-462)
-Fix-Commit: 966d6ac9e40222b84bb21674cc4f83c8d72a5a26 (2.34-461)
-Fix-Commit: e3eef1b8fbdd3a7917af466ca9c4b7477251ca79 (2.33-266)
-Fix-Commit: f20a8d696b13c6261b52a6434899121f8b19d5a7 (2.33-265)
-Fix-Commit: be602180146de37582a3da3a0caa4b719645de9c (2.32-143)
-Fix-Commit: 394eae338199078b7961b051c191539870742d7b (2.32-142)
-Fix-Commit: 8d7949183760170c61e55def723c1d8050187874 (2.31-157)
-Fix-Commit: 304ce5fe466c4762b21b36c26926a4657b59b53e (2.31-156)
diff --git a/advisories/GLIBC-SA-2024-0007 b/advisories/GLIBC-SA-2024-0007
deleted file mode 100644
index b6928fa27..000000000
--- a/advisories/GLIBC-SA-2024-0007
+++ /dev/null
@@ -1,28 +0,0 @@
-nscd: netgroup cache may terminate daemon on memory allocation failure
-
-The Name Service Cache Daemon's (nscd) netgroup cache uses xmalloc or
-xrealloc and these functions may terminate the process due to a memory
-allocation failure resulting in a denial of service to the clients.  The
-flaw was introduced in glibc 2.15 when the cache was added to nscd.
-
-This vulnerability is only present in the nscd binary.
-
-Subsequent refactoring of the netgroup cache only added more uses of
-xmalloc and xrealloc. Uses of xmalloc and xrealloc in other parts of
-nscd only occur during startup of the daemon and so are not affected by
-client requests that could trigger an out of memory followed by
-termination.
-
-CVE-Id: CVE-2024-33601
-Public-Date: 2024-04-24
-Vulnerable-Commit: 684ae515993269277448150a1ca70db3b94aa5bd (2.15)
-Fix-Commit: c04a21e050d64a1193a6daab872bca2528bda44b (2.40)
-Fix-Commit: a9a8d3eebb145779a18d90e3966009a1daa63cd8 (2.39-38)
-Fix-Commit: 71af8ca864345d39b746d5cee84b94b430fad5db (2.38-75)
-Fix-Commit: 6e106dc214d6a033a4e945d1c6cf58061f1c5f1f (2.37-94)
-Fix-Commit: b6742463694b1dfdd5120b91ee21cf05d15ec2e2 (2.36-168)
-Fix-Commit: 7a5864cac60e06000394128a5a2817b03542f5a3 (2.35-319)
-Fix-Commit: 86f1d5f4129c373ac6fb6df5bcf38273838843cb (2.34-463)
-Fix-Commit: 4d27d4b9a188786fc6a56745506cec2acfc51f83 (2.33-267)
-Fix-Commit: 3ed195a8ec89da281e3c4bf887a13d281b72d8f4 (2.32-144)
-Fix-Commit: bbf5a58ccb55679217f94de706164d15372fbbc0 (2.31-158)
diff --git a/advisories/GLIBC-SA-2024-0008 b/advisories/GLIBC-SA-2024-0008
deleted file mode 100644
index d93e2a6f0..000000000
--- a/advisories/GLIBC-SA-2024-0008
+++ /dev/null
@@ -1,26 +0,0 @@
-nscd: netgroup cache assumes NSS callback uses in-buffer strings
-
-The Name Service Cache Daemon's (nscd) netgroup cache can corrupt memory
-when the NSS callback does not store all strings in the provided buffer.
-The flaw was introduced in glibc 2.15 when the cache was added to nscd.
-
-This vulnerability is only present in the nscd binary.
-
-There is no guarantee from the NSS callback API that the returned
-strings are all within the buffer. However, the netgroup cache code
-assumes that the NSS callback uses in-buffer strings and if it doesn't
-the buffer resizing logic could lead to potential memory corruption.
-
-CVE-Id: CVE-2024-33602
-Public-Date: 2024-04-24
-Vulnerable-Commit: 684ae515993269277448150a1ca70db3b94aa5bd (2.15)
-Fix-Commit: c04a21e050d64a1193a6daab872bca2528bda44b (2.40)
-Fix-Commit: a9a8d3eebb145779a18d90e3966009a1daa63cd8 (2.39-38)
-Fix-Commit: 71af8ca864345d39b746d5cee84b94b430fad5db (2.38-75)
-Fix-Commit: 6e106dc214d6a033a4e945d1c6cf58061f1c5f1f (2.37-94)
-Fix-Commit: b6742463694b1dfdd5120b91ee21cf05d15ec2e2 (2.36-168)
-Fix-Commit: 7a5864cac60e06000394128a5a2817b03542f5a3 (2.35-319)
-Fix-Commit: 86f1d5f4129c373ac6fb6df5bcf38273838843cb (2.34-463)
-Fix-Commit: 4d27d4b9a188786fc6a56745506cec2acfc51f83 (2.33-267)
-Fix-Commit: 3ed195a8ec89da281e3c4bf887a13d281b72d8f4 (2.32-144)
-Fix-Commit: bbf5a58ccb55679217f94de706164d15372fbbc0 (2.31-158)
diff --git a/advisories/GLIBC-SA-2025-0001 b/advisories/GLIBC-SA-2025-0001
deleted file mode 100644
index 45f8b8f18..000000000
--- a/advisories/GLIBC-SA-2025-0001
+++ /dev/null
@@ -1,25 +0,0 @@
-assert: Buffer overflow when printing assertion failure message
-
-When the assert() function fails, it does not allocate enough space for the
-assertion failure message string and size information, which may lead to a
-buffer overflow if the message string size aligns to page size.
-
-This bug can be triggered when an assertion in a program fails.  The assertion
-failure message is allocated to allow developers to see this failure in core
-dumps and it typically includes, in addition to the invariant assertion
-string and function name, the name of the program.  If the name of the failing
-program is user controlled, for example on a local system, this could allow an
-attacker to control the assertion failure to trigger this buffer overflow.
-
-The only viable vector for exploitation of this bug is local, if a setuid
-program exists that has an existing bug that results in an assertion failure.
-No such program has been discovered at the time of publishing this advisory,
-but the presence of custom setuid programs, although strongly discouraged as a
-security practice, cannot be discounted.
-
-CVE-Id: CVE-2025-0395
-Public-Date: 2025-01-22
-Vulnerable-Commit: f8a3b5bf8fa1d0c43d2458e03cc109a04fdef194 (2.13-175)
-Fix-Commit: 68ee0f704cb81e9ad0a78c644a83e1e9cd2ee578 (2.41)
-Fix-Commit: 7d4b6bcae91f29d7b4daf15bab06b66cf1d2217c (2.40-66)
-Reported-By: Qualys Security Advisory
diff --git a/advisories/README b/advisories/README
deleted file mode 100644
index b8f8a829c..000000000
--- a/advisories/README
+++ /dev/null
@@ -1,77 +0,0 @@
-GNU C Library Security Advisory Format
-======================================
-
-Security advisories in this directory follow a simple git commit log
-format, with a heading and free-format description augmented with tags
-to allow parsing key information.  References to code changes are
-specific to the glibc repository and follow a specific format:
-
-  Tag-name: <commit-ref> (release-version)
-
-The <commit-ref> indicates a specific commit in the repository.  The
-release-version indicates the publicly consumable release in which this
-commit is known to exist.  The release-version is derived from the
-git-describe format, (i.e. stripped out from glibc-2.34.NNN-gxxxx) and
-is of the form 2.34-NNN.  If the -NNN suffix is absent, it means that
-the change is in that release tarball, otherwise the change is on the
-release/2.YY/master branch and not in any released tarball.
-
-The following tags are currently being used:
-
-CVE-Id:
-This is the CVE-Id assigned under the CVE Program
-(https://www.cve.org/).
-
-Public-Date:
-The date this issue became publicly known.
-
-Vulnerable-Commit:
-The commit that introduced this vulnerability.  There could be multiple
-entries, one for each release branch in the glibc repository; the
-release-version portion of this tag should tell you which branch this is
-on.
-
-Fix-Commit:
-The commit that fixed this vulnerability.  There could be multiple
-entries for each release branch in the glibc repository, indicating that
-all of those commits contributed to fixing that issue in each of those
-branches.
-
-Reported-By:
-The entity that reported this issue. There could be multiple entries, one for
-each reporter.
-
-Adding an Advisory
-------------------
-
-An advisory for a CVE needs to be added on the master branch in two steps:
-
-1. Add the text of the advisory without any Fix-Commit tags along with
-   the fix for the CVE.  Add the Vulnerable-Commit tag, if applicable.
-   The advisories directory does not exist in release branches, so keep
-   the advisory text commit distinct from the code changes, to ease
-   backports.  Ask for the GLIBC-SA advisory number from the security
-   team.
-
-2. Finish all backports on release branches and then back on the msater
-   branch, add all commit refs to the advisory using the Fix-Commit
-   tags.  Don't bother adding the release-version subscript since the
-   next step will overwrite it.
-
-3. Run the process-advisories.sh script in the scripts directory on the
-   advisory:
-
-     scripts/process-advisories.sh update GLIBC-SA-YYYY-NNNN
-
-   (replace YYYY-NNNN with the actual advisory number).
-
-4. Verify the updated advisory and push the result.
-
-Getting a NEWS snippet from advisories
---------------------------------------
-
-Run:
-
-  scripts/process-advisories.sh news
-
-and copy the content into the NEWS file.
diff --git a/assert/Makefile b/assert/Makefile
index 65b9d0768..8d106d875 100644
--- a/assert/Makefile
+++ b/assert/Makefile
@@ -39,6 +39,7 @@ tests := \
   test-assert-perr \
   tst-assert-c++ \
   tst-assert-g++ \
+  tst-assert-sa-2025-0001 \
   # tests
 
 ifeq ($(have-cxx-thread_local),yes)
diff --git a/assert/tst-assert-sa-2025-0001.c b/assert/tst-assert-sa-2025-0001.c
new file mode 100644
index 000000000..102cb0078
--- /dev/null
+++ b/assert/tst-assert-sa-2025-0001.c
@@ -0,0 +1,92 @@
+/* Test for CVE-2025-0395.
+   Copyright The GNU Toolchain Authors.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* Test that a large enough __progname does not result in a buffer overflow
+   when printing an assertion failure.  This was CVE-2025-0395.  */
+#include <assert.h>
+#include <inttypes.h>
+#include <signal.h>
+#include <stdbool.h>
+#include <string.h>
+#include <sys/mman.h>
+#include <support/check.h>
+#include <support/support.h>
+#include <support/xstdio.h>
+#include <support/xunistd.h>
+
+extern const char *__progname;
+
+/* Size __progname to fill a page, guard writable maps, fail an assert.  */
+int
+do_test (int argc, char **argv)
+{
+  support_need_proc ("Reads /proc/self/maps to add guards to writable maps.");
+  ignore_stderr ();
+
+  /* XXX assumes that the assert is on a 2 digit line number.  */
+  const char *prompt = ": %s:99: do_test: Assertion `argc < 1' failed.\n";
+
+  int ret = fprintf (stderr, prompt, __FILE__);
+  if (ret < 0)
+    FAIL_EXIT1 ("fprintf failed: %m\n");
+
+  size_t pagesize = getpagesize ();
+  size_t namesize = pagesize - 1 - ret;
+
+  /* Alter the progname so that the assert message fills the entire page.  */
+  char progname[namesize];
+  memset (progname, 'A', namesize - 1);
+  progname[namesize - 1] = '\0';
+  __progname = progname;
+
+  FILE *f = xfopen ("/proc/self/maps", "r");
+  char *line = NULL;
+  size_t len = 0;
+  uintptr_t prev_to = 0;
+
+  /* Pad the beginning of every writable mapping with a PROT_NONE map.  This
+     ensures that the mmap in the assert_fail path never ends up below a
+     writable map and will terminate immediately in case of a buffer
+     overflow.  */
+  while (xgetline (&line, &len, f))
+    {
+      uintptr_t from, to;
+      char perm[4];
+      /* Never use FROM, TO or PERM after a failed or partial parse.  */
+      if (sscanf (line, "%" SCNxPTR "-%" SCNxPTR " %c%c%c%c ", &from, &to,
+		  &perm[0], &perm[1], &perm[2], &perm[3]) != 6)
+	FAIL_EXIT1 ("cannot parse /proc/self/maps line: %s", line);
+
+      bool writable = (memchr (perm, 'w', 4) != NULL);
+
+      if (prev_to != 0 && from - prev_to > pagesize && writable)
+	xmmap ((void *) from - pagesize, pagesize, PROT_NONE,
+	       MAP_ANONYMOUS | MAP_PRIVATE, -1);
+
+      prev_to = to;
+    }
+
+  xfclose (f);
+
+  assert (argc < 1);
+  return 0;
+}
+
+#define EXPECTED_SIGNAL SIGABRT
+#define TEST_FUNCTION_ARGV do_test
+#include <support/test-driver.c>
diff --git a/benchtests/atanh-inputs b/benchtests/atanh-inputs
index 455aa65b6..498529325 100644
--- a/benchtests/atanh-inputs
+++ b/benchtests/atanh-inputs
@@ -1,6 +1,7 @@
 ## args: double
 ## ret: double
 ## includes: math.h
+## name: workload-random
 0x1.5a2730bacd94ap-1
 -0x1.b57eb40fc048ep-21
 -0x1.c0b185fb450e2p-17
diff --git a/benchtests/sinh-inputs b/benchtests/sinh-inputs
index 7b1ac46a3..2fcb2fabf 100644
--- a/benchtests/sinh-inputs
+++ b/benchtests/sinh-inputs
@@ -1,6 +1,7 @@
 ## args: double
 ## ret: double
 ## includes: math.h
+## name: workload-random
 0x1.bcb6129b5ff2bp8
 -0x1.63057386325ebp9
 0x1.62f1d7dc4e8bfp9
diff --git a/config.make.in b/config.make.in
index 36096881b..59897eaec 100644
--- a/config.make.in
+++ b/config.make.in
@@ -53,6 +53,7 @@ c++-bits-std_abs-h = @CXX_BITS_STD_ABS_H@
 enable-werror = @enable_werror@
 
 have-z-execstack = @libc_cv_z_execstack@
+have-no-error-execstack = @libc_cv_no_error_execstack@
 have-protected-data = @libc_cv_protected_data@
 have-insert = @libc_cv_insert@
 have-glob-dat-reloc = @libc_cv_has_glob_dat@
diff --git a/configure b/configure
index eb8abd005..674d1d7e4 100755
--- a/configure
+++ b/configure
@@ -659,6 +659,7 @@ libc_cv_has_glob_dat
 libc_cv_fpie
 libc_cv_test_static_pie
 libc_cv_z_execstack
+libc_cv_no_error_execstack
 ASFLAGS_config
 libc_cv_cc_with_libunwind
 libc_cv_insert
@@ -7114,6 +7115,40 @@ if test $libc_cv_as_noexecstack = yes; then
 fi
 
 
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for linker that supports --no-error-execstack" >&5
+printf %s "checking for linker that supports --no-error-execstack... " >&6; }
+libc_linker_feature=no
+cat > conftest.c <<EOF
+int _start (void) { return 42; }
+EOF
+if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS $no_ssp
+		  -Wl,--no-error-execstack -nostdlib -nostartfiles
+		  -fPIC -shared -o conftest.so conftest.c
+		  1>&5'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }
+then
+  if ${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS $no_ssp -Wl,--no-error-execstack -nostdlib \
+      -nostartfiles -fPIC -shared -o conftest.so conftest.c 2>&1 \
+      | grep "warning: --no-error-execstack ignored" > /dev/null 2>&1; then
+    true
+  else
+    libc_linker_feature=yes
+  fi
+fi
+rm -f conftest*
+if test $libc_linker_feature = yes; then
+  libc_cv_no_error_execstack=yes
+else
+  libc_cv_no_error_execstack=no
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_linker_feature" >&5
+printf "%s\n" "$libc_linker_feature" >&6; }
+
+
 { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for linker that supports -z execstack" >&5
 printf %s "checking for linker that supports -z execstack... " >&6; }
 libc_linker_feature=no
@@ -8643,6 +8678,35 @@ if test $libc_cv_builtin_trap = yes; then
 
 fi
 
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking whether the compiler supports __attribute__ ((aligned (65536)))" >&5
+printf %s "checking whether the compiler supports __attribute__ ((aligned (65536)))... " >&6; }
+if test ${libc_cv_aligned_65536+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e)
+cat confdefs.h - <<_ACEOF >conftest.$ac_ext
+/* end confdefs.h.  */
+
+char bss[0xb5dce8] __attribute__ ((aligned (65536)));
+
+_ACEOF
+if ac_fn_c_try_compile "$LINENO"
+then :
+  libc_cv_aligned_65536=yes
+else case e in #(
+  e) libc_cv_aligned_65536=no ;;
+esac
+fi
+rm -f core conftest.err conftest.$ac_objext conftest.beam conftest.$ac_ext
+ ;;
+esac
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_aligned_65536" >&5
+printf "%s\n" "$libc_cv_aligned_65536" >&6; }
+config_vars="$config_vars
+aligned-65536 = $libc_cv_aligned_65536"
+
 ac_ext=cpp
 ac_cpp='$CXXCPP $CPPFLAGS'
 ac_compile='$CXX -c $CXXFLAGS $CPPFLAGS conftest.$ac_ext >&5'
@@ -8908,6 +8972,104 @@ printf "%s\n" "$libc_linker_feature" >&6; }
 config_vars="$config_vars
 load-address-ldflag = $libc_cv_load_address_ldflag"
 
+# Check if compilers support GCS in branch protection:
+
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking if compiler supports -mbranch-protection=gcs" >&5
+printf %s "checking if compiler supports -mbranch-protection=gcs... " >&6; }
+if test ${libc_cv_cc_gcs+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) if { ac_try='${CC-cc} -Werror -mbranch-protection=gcs -xc /dev/null -S -o /dev/null'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }
+then :
+  libc_cv_cc_gcs=yes
+else case e in #(
+  e) libc_cv_cc_gcs=no ;;
+esac
+fi ;;
+esac
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_cc_gcs" >&5
+printf "%s\n" "$libc_cv_cc_gcs" >&6; }
+if test "$TEST_CC" = "$CC"; then
+  libc_cv_test_cc_gcs=$libc_cv_cc_gcs
+else
+
+saved_CC="$CC"
+CC="$TEST_CC"
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking if compiler supports -mbranch-protection=gcs in testing" >&5
+printf %s "checking if compiler supports -mbranch-protection=gcs in testing... " >&6; }
+if test ${libc_cv_test_cc_gcs+y}
+then :
+  printf %s "(cached) " >&6
+else case e in #(
+  e) if { ac_try='${CC-cc} -Werror -mbranch-protection=gcs -xc /dev/null -S -o /dev/null'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }
+then :
+  libc_cv_test_cc_gcs=yes
+else case e in #(
+  e) libc_cv_test_cc_gcs=no ;;
+esac
+fi ;;
+esac
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_cv_test_cc_gcs" >&5
+printf "%s\n" "$libc_cv_test_cc_gcs" >&6; }
+
+CC="$saved_CC"
+
+fi
+
+config_vars="$config_vars
+have-cc-gcs = $libc_cv_cc_gcs"
+config_vars="$config_vars
+have-test-cc-gcs = $libc_cv_test_cc_gcs"
+
+# Check if linker supports GCS marking
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking for linker that supports -z gcs=always" >&5
+printf %s "checking for linker that supports -z gcs=always... " >&6; }
+libc_linker_feature=no
+cat > conftest.c <<EOF
+int _start (void) { return 42; }
+EOF
+if { ac_try='${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS $no_ssp
+		  -Wl,-z,gcs=always -nostdlib -nostartfiles
+		  -fPIC -shared -o conftest.so conftest.c
+		  1>&5'
+  { { eval echo "\"\$as_me\":${as_lineno-$LINENO}: \"$ac_try\""; } >&5
+  (eval $ac_try) 2>&5
+  ac_status=$?
+  printf "%s\n" "$as_me:${as_lineno-$LINENO}: \$? = $ac_status" >&5
+  test $ac_status = 0; }; }
+then
+  if ${CC-cc} $CFLAGS $CPPFLAGS $LDFLAGS $no_ssp -Wl,-z,gcs=always -nostdlib \
+      -nostartfiles -fPIC -shared -o conftest.so conftest.c 2>&1 \
+      | grep "warning: -z gcs=always ignored" > /dev/null 2>&1; then
+    true
+  else
+    libc_linker_feature=yes
+  fi
+fi
+rm -f conftest*
+if test $libc_linker_feature = yes; then
+  libc_cv_ld_gcs=yes
+else
+  libc_cv_ld_gcs=no
+fi
+{ printf "%s\n" "$as_me:${as_lineno-$LINENO}: result: $libc_linker_feature" >&5
+printf "%s\n" "$libc_linker_feature" >&6; }
+config_vars="$config_vars
+have-ld-gcs = $libc_cv_ld_gcs"
+
 { printf "%s\n" "$as_me:${as_lineno-$LINENO}: checking if we can build programs as PIE" >&5
 printf %s "checking if we can build programs as PIE... " >&6; }
 cat confdefs.h - <<_ACEOF >conftest.$ac_ext
diff --git a/configure.ac b/configure.ac
index 050bfa65e..57cd24c87 100644
--- a/configure.ac
+++ b/configure.ac
@@ -1318,6 +1318,10 @@ if test $libc_cv_as_noexecstack = yes; then
 fi
 AC_SUBST(ASFLAGS_config)
 
+LIBC_LINKER_FEATURE([--no-error-execstack], [-Wl,--no-error-execstack],
+		    [libc_cv_no_error_execstack=yes], [libc_cv_no_error_execstack=no])
+AC_SUBST(libc_cv_no_error_execstack)
+
 LIBC_LINKER_FEATURE([-z execstack], [-Wl,-z,execstack],
 		    [libc_cv_z_execstack=yes], [libc_cv_z_execstack=no])
 AC_SUBST(libc_cv_z_execstack)
@@ -1820,6 +1824,17 @@ if test $libc_cv_builtin_trap = yes; then
   AC_DEFINE([HAVE_BUILTIN_TRAP])
 fi
 
+dnl Check for 64 KiB object alignment; it gates the elf/tst-pie-bss tests.
+AC_CACHE_CHECK([whether the compiler supports __attribute__ ((aligned (65536)))],
+	       libc_cv_aligned_65536, [
+AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
+char bss[0xb5dce8] __attribute__ ((aligned (65536)));
+]])],
+	       [libc_cv_aligned_65536=yes],
+	       [libc_cv_aligned_65536=no])
+])
+LIBC_CONFIG_VAR([aligned-65536], [$libc_cv_aligned_65536])
+
 dnl C++ feature tests.
 AC_LANG_PUSH([C++])
 
@@ -1992,6 +2007,23 @@ LIBC_LINKER_FEATURE([-Ttext-segment=$libc_cv_pde_load_address],
 		    [libc_cv_load_address_ldflag=])
 LIBC_CONFIG_VAR([load-address-ldflag], [$libc_cv_load_address_ldflag])
 
+# Check if compilers support GCS in branch protection:
+LIBC_TRY_CC_AND_TEST_CC_OPTION([if compiler supports -mbranch-protection=gcs],
+  [-Werror -mbranch-protection=gcs],
+  libc_cv_cc_gcs,
+  [libc_cv_cc_gcs=yes],
+  [libc_cv_cc_gcs=no],
+  libc_cv_test_cc_gcs,
+  [libc_cv_test_cc_gcs=yes],
+  [libc_cv_test_cc_gcs=no])
+LIBC_CONFIG_VAR([have-cc-gcs], [$libc_cv_cc_gcs])
+LIBC_CONFIG_VAR([have-test-cc-gcs], [$libc_cv_test_cc_gcs])
+
+# Check if linker supports GCS marking
+LIBC_LINKER_FEATURE([-z gcs=always], [-Wl,-z,gcs=always],
+		    [libc_cv_ld_gcs=yes], [libc_cv_ld_gcs=no])
+LIBC_CONFIG_VAR([have-ld-gcs], [$libc_cv_ld_gcs])
+
 AC_MSG_CHECKING(if we can build programs as PIE)
 AC_COMPILE_IFELSE([AC_LANG_SOURCE([[#ifdef PIE_UNSUPPORTED
 # error PIE is not supported
diff --git a/elf/Makefile b/elf/Makefile
index 4b1d0d874..b8064ef14 100644
--- a/elf/Makefile
+++ b/elf/Makefile
@@ -34,7 +34,6 @@ routines = \
   dl-addr \
   dl-addr-obj \
   dl-early_allocate \
-  dl-find_object \
   dl-iteratephdr \
   dl-libc \
   dl-origin \
@@ -61,6 +60,8 @@ dl-routines = \
   dl-deps \
   dl-exception \
   dl-execstack \
+  dl-execstack-tunable \
+  dl-find_object \
   dl-fini \
   dl-init \
   dl-load \
@@ -266,6 +267,7 @@ tests-static-normal := \
   tst-array1-static \
   tst-array5-static \
   tst-dl-iter-static \
+  tst-dlopen-sgid \
   tst-dst-static \
   tst-env-setuid-static \
   tst-getauxval-static \
@@ -379,6 +381,7 @@ tests += \
   tst-align3 \
   tst-audit-tlsdesc \
   tst-audit-tlsdesc-dlopen \
+  tst-audit-tlsdesc-dlopen2 \
   tst-audit1 \
   tst-audit2 \
   tst-audit8 \
@@ -532,6 +535,8 @@ tests-internal += \
   tst-dl_find_object-threads \
   tst-dlmopen2 \
   tst-hash-collision3 \
+  tst-link-map-contiguous-ldso \
+  tst-link-map-contiguous-libc \
   tst-ptrguard1 \
   tst-stackguard1 \
   tst-tls-surplus \
@@ -543,6 +548,10 @@ tests-internal += \
   unload2 \
   # tests-internal
 
+ifeq ($(build-hardcoded-path-in-tests),yes)
+tests-internal += tst-link-map-contiguous-main
+endif
+
 tests-container += \
   tst-dlopen-self-container \
   tst-dlopen-tlsmodid-container \
@@ -567,9 +576,11 @@ tests-execstack-yes = \
   tst-execstack \
   tst-execstack-needed \
   tst-execstack-prog \
+  tst-execstack-tunable \
   # tests-execstack-yes
 tests-execstack-static-yes = \
-  tst-execstack-prog-static
+  tst-execstack-prog-static \
+  tst-execstack-prog-static-tunable \
   # tests-execstack-static-yes
 ifeq (yes,$(run-built-tests))
 tests-execstack-special-yes = \
@@ -863,6 +874,7 @@ modules-names += \
   tst-auditmanymod8 \
   tst-auditmanymod9 \
   tst-auditmod-tlsdesc  \
+  tst-auditmod-tlsdesc2 \
   tst-auditmod1 \
   tst-auditmod11 \
   tst-auditmod12 \
@@ -905,6 +917,7 @@ modules-names += \
   tst-dlmopen1mod \
   tst-dlopen-auditdup-auditmod \
   tst-dlopen-auditdupmod \
+  tst-dlopen-sgid-mod \
   tst-dlopen-tlsreinitmod1 \
   tst-dlopen-tlsreinitmod2 \
   tst-dlopen-tlsreinitmod3 \
@@ -1144,6 +1157,10 @@ tests-pie += \
   tst-pie1 \
   tst-pie2 \
   # tests-pie
+ifeq (yes,$(aligned-65536))
+tests += tst-pie-bss
+tests-pie += tst-pie-bss
+endif
 ifneq (,$(load-address-ldflag))
 tests += \
   tst-pie-address \
@@ -1159,6 +1176,10 @@ tests += \
 tests-static += \
   tst-pie-address-static \
   # tests-static
+ifeq (yes,$(aligned-65536))
+tests += tst-pie-bss-static
+tests-static += tst-pie-bss-static
+endif
 LDFLAGS-tst-pie-address-static += \
   $(load-address-ldflag)=$(pde-load-address)
 endif
@@ -1988,6 +2009,9 @@ $(objpfx)tst-execstack.out: $(objpfx)tst-execstack-mod.so
 CPPFLAGS-tst-execstack.c += -DUSE_PTHREADS=0
 LDFLAGS-tst-execstack = -Wl,-z,noexecstack
 LDFLAGS-tst-execstack-mod.so = -Wl,-z,execstack
+ifeq ($(have-no-error-execstack),yes)
+LDFLAGS-tst-execstack-mod.so += -Wl,--no-error-execstack
+endif
 
 $(objpfx)tst-execstack-needed: $(objpfx)tst-execstack-mod.so
 LDFLAGS-tst-execstack-needed = -Wl,-z,noexecstack
@@ -1996,7 +2020,18 @@ LDFLAGS-tst-execstack-prog = -Wl,-z,execstack
 CFLAGS-tst-execstack-prog.c += -Wno-trampolines
 CFLAGS-tst-execstack-mod.c += -Wno-trampolines
 
+# It expects loading a module with executable stack to work.
+CFLAGS-tst-execstack-tunable.c += -DUSE_PTHREADS=0 -DDEFAULT_RWX_STACK=1
+$(objpfx)tst-execstack-tunable.out: $(objpfx)tst-execstack-mod.so
+tst-execstack-tunable-ENV = GLIBC_TUNABLES=glibc.rtld.execstack=2
+
+LDFLAGS-tst-execstack-prog-static-tunable = -Wl,-z,noexecstack
+tst-execstack-prog-static-tunable-ENV = GLIBC_TUNABLES=glibc.rtld.execstack=2
+
 LDFLAGS-tst-execstack-prog-static = -Wl,-z,execstack
+ifeq ($(have-no-error-execstack),yes)
+LDFLAGS-tst-execstack-prog-static += -Wl,--no-error-execstack
+endif
 CFLAGS-tst-execstack-prog-static.c += -Wno-trampolines
 
 ifeq (yes,$(build-hardcoded-path-in-tests))
@@ -2074,6 +2109,7 @@ $(objpfx)tst-array5-static-cmp.out: tst-array5-static.exp \
 
 CFLAGS-tst-pie1.c += $(pie-ccflag)
 CFLAGS-tst-pie2.c += $(pie-ccflag)
+CFLAGS-tst-pie-bss.c += $(pie-ccflag)
 CFLAGS-tst-pie-address.c += $(pie-ccflag)
 
 $(objpfx)tst-piemod1.so: $(libsupport)
@@ -3189,6 +3225,9 @@ $(objpfx)tst-audit-tlsdesc.out: $(objpfx)tst-auditmod-tlsdesc.so
 tst-audit-tlsdesc-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so
 $(objpfx)tst-audit-tlsdesc-dlopen.out: $(objpfx)tst-auditmod-tlsdesc.so
 tst-audit-tlsdesc-dlopen-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc.so
+$(objpfx)tst-audit-tlsdesc-dlopen2.out: $(objpfx)tst-auditmod-tlsdesc2.so \
+  $(patsubst %, $(objpfx)%.so, $(tlsmod17a-modules))
+tst-audit-tlsdesc-dlopen2-ENV = LD_AUDIT=$(objpfx)tst-auditmod-tlsdesc2.so
 
 $(objpfx)tst-dlmopen-twice.out: \
   $(objpfx)tst-dlmopen-twice-mod1.so \
@@ -3392,3 +3431,5 @@ $(objpfx)tst-nolink-libc-2: $(objpfx)tst-nolink-libc.o
 	  -Wl,--dynamic-linker=$(objpfx)ld.so
 $(objpfx)tst-nolink-libc-2.out: $(objpfx)tst-nolink-libc-2 $(objpfx)ld.so
 	$< > $@ 2>&1; $(evaluate-test)
+
+$(objpfx)tst-dlopen-sgid.out: $(objpfx)tst-dlopen-sgid-mod.so
diff --git a/elf/dl-execstack-tunable.c b/elf/dl-execstack-tunable.c
new file mode 100644
index 000000000..e3b638aea
--- /dev/null
+++ b/elf/dl-execstack-tunable.c
@@ -0,0 +1,39 @@
+/* Stack executability handling for GNU dynamic linker.
+   Copyright (C) 2025 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <ldsodefs.h>
+#include <dl-tunables.h>
+
+/* Enforce glibc.rtld.execstack: reject or force an executable stack.  */
+void
+_dl_handle_execstack_tunable (void)
+{
+  switch (TUNABLE_GET (glibc, rtld, execstack, int32_t, NULL))
+    {
+    case stack_tunable_mode_disable:
+      if (__glibc_unlikely (GL(dl_stack_flags) & PF_X))
+	_dl_fatal_printf (
+"Fatal glibc error: executable stack is not allowed\n");
+      break;
+    case stack_tunable_mode_force:
+      if (_dl_make_stack_executable (__libc_stack_end) != 0)
+	_dl_fatal_printf (
+"Fatal glibc error: cannot enable executable stack as tunable requires\n");
+      break;
+    }
+}
diff --git a/elf/dl-execstack.c b/elf/dl-execstack.c
index e4d7dbe7f..ceec5b2de 100644
--- a/elf/dl-execstack.c
+++ b/elf/dl-execstack.c
@@ -23,7 +23,7 @@
    so as to mprotect it.  */
 
 int
-_dl_make_stack_executable (void **stack_endp)
+_dl_make_stack_executable (const void *stack_endp)
 {
   return ENOSYS;
 }
diff --git a/elf/dl-find_object.c b/elf/dl-find_object.c
index 513e46401..c9f4c1c8d 100644
--- a/elf/dl-find_object.c
+++ b/elf/dl-find_object.c
@@ -356,7 +356,7 @@ _dlfo_lookup (uintptr_t pc, struct dl_find_object_internal *first1, size_t size)
 }
 
 int
-__dl_find_object (void *pc1, struct dl_find_object *result)
+_dl_find_object (void *pc1, struct dl_find_object *result)
 {
   uintptr_t pc = (uintptr_t) pc1;
 
@@ -463,8 +463,38 @@ __dl_find_object (void *pc1, struct dl_find_object *result)
         return -1;
     } /* Transaction retry loop.  */
 }
-hidden_def (__dl_find_object)
-weak_alias (__dl_find_object, _dl_find_object)
+rtld_hidden_def (_dl_find_object)
+
+/* Subroutine of _dlfo_process_initial to split out noncontiguous link
+   maps: one _dlfo_nodelete_mappings element is used per PT_LOAD segment.
+   NODELETE is the number of used elements on entry; it is incremented
+   once per segment, and the new NODELETE value is returned.  */
+static size_t
+_dlfo_process_initial_noncontiguous_map (struct link_map *map,
+                                         size_t nodelete)
+{
+  struct dl_find_object_internal dlfo;
+  _dl_find_object_from_map (map, &dlfo);
+
+  /* PT_LOAD segments for a non-contiguous link map are added to the
+     non-closeable mappings.  */
+  const ElfW(Phdr) *ph = map->l_phdr;
+  const ElfW(Phdr) *ph_end = map->l_phdr + map->l_phnum;
+  for (; ph < ph_end; ++ph)
+    if (ph->p_type == PT_LOAD)
+      {
+        if (_dlfo_nodelete_mappings != NULL)
+          {
+            /* Second pass only: copy DLFO, then narrow it to this segment.  */
+            _dlfo_nodelete_mappings[nodelete] = dlfo;
+            ElfW(Addr) start = ph->p_vaddr + map->l_addr;
+            _dlfo_nodelete_mappings[nodelete].map_start = start;
+            _dlfo_nodelete_mappings[nodelete].map_end = start + ph->p_memsz;
+          }
+        ++nodelete;
+      }
+  return nodelete;
+}
 
 /* _dlfo_process_initial is called twice.  First to compute the array
    sizes from the initial loaded mappings.  Second to fill in the
@@ -477,29 +507,8 @@ _dlfo_process_initial (void)
 
   size_t nodelete = 0;
   if (!main_map->l_contiguous)
-    {
-      struct dl_find_object_internal dlfo;
-      _dl_find_object_from_map (main_map, &dlfo);
-
-      /* PT_LOAD segments for a non-contiguous are added to the
-         non-closeable mappings.  */
-      for (const ElfW(Phdr) *ph = main_map->l_phdr,
-             *ph_end = main_map->l_phdr + main_map->l_phnum;
-           ph < ph_end; ++ph)
-        if (ph->p_type == PT_LOAD)
-          {
-            if (_dlfo_nodelete_mappings != NULL)
-              {
-                /* Second pass only.  */
-                _dlfo_nodelete_mappings[nodelete] = dlfo;
-                _dlfo_nodelete_mappings[nodelete].map_start
-                  = ph->p_vaddr + main_map->l_addr;
-                _dlfo_nodelete_mappings[nodelete].map_end
-                  = _dlfo_nodelete_mappings[nodelete].map_start + ph->p_memsz;
-              }
-            ++nodelete;
-          }
-    }
+    /* Contiguous case already handled in _dl_find_object_init.  */
+    nodelete = _dlfo_process_initial_noncontiguous_map (main_map, nodelete);
 
   size_t loaded = 0;
   for (Lmid_t ns = 0; ns < GL(dl_nns); ++ns)
@@ -511,11 +520,18 @@ _dlfo_process_initial (void)
           /* lt_library link maps are implicitly NODELETE.  */
           if (l->l_type == lt_library || l->l_nodelete_active)
             {
-              if (_dlfo_nodelete_mappings != NULL)
-                /* Second pass only.  */
-                _dl_find_object_from_map
-                  (l, _dlfo_nodelete_mappings + nodelete);
-              ++nodelete;
+              /* The kernel may have loaded ld.so with gaps.   */
+              if (!l->l_contiguous && is_rtld_link_map (l))
+                nodelete
+                  = _dlfo_process_initial_noncontiguous_map (l, nodelete);
+              else
+                {
+                  if (_dlfo_nodelete_mappings != NULL)
+                    /* Second pass only.  */
+                    _dl_find_object_from_map
+                      (l, _dlfo_nodelete_mappings + nodelete);
+                  ++nodelete;
+                }
             }
           else if (l->l_type == lt_loaded)
             {
@@ -765,7 +781,6 @@ _dl_find_object_update_1 (struct link_map **loaded, size_t count)
           /* Prefer newly loaded link map.  */
           assert (loaded_index1 > 0);
           _dl_find_object_from_map (loaded[loaded_index1 - 1], dlfo);
-          loaded[loaded_index1 -  1]->l_find_object_processed = 1;
           --loaded_index1;
         }
 
diff --git a/elf/dl-find_object.h b/elf/dl-find_object.h
index e433ff874..563af3de1 100644
--- a/elf/dl-find_object.h
+++ b/elf/dl-find_object.h
@@ -87,7 +87,7 @@ _dl_find_object_to_external (struct dl_find_object_internal *internal,
 }
 
 /* Extract the object location data from a link map and writes it to
-   *RESULT using relaxed MO stores.  */
+   *RESULT using relaxed MO stores.  Set L->l_find_object_processed.  */
 static void __attribute__ ((unused))
 _dl_find_object_from_map (struct link_map *l,
                           struct dl_find_object_internal *result)
@@ -100,6 +100,8 @@ _dl_find_object_from_map (struct link_map *l,
   atomic_store_relaxed (&result->eh_dbase, (void *) l->l_info[DT_PLTGOT]);
 #endif
 
+  l->l_find_object_processed = 1;
+
   for (const ElfW(Phdr) *ph = l->l_phdr, *ph_end = l->l_phdr + l->l_phnum;
        ph < ph_end; ++ph)
     if (ph->p_type == DLFO_EH_SEGMENT_TYPE)
diff --git a/elf/dl-load.c b/elf/dl-load.c
index f905578a6..945dd8a23 100644
--- a/elf/dl-load.c
+++ b/elf/dl-load.c
@@ -945,7 +945,7 @@ struct link_map *
 _dl_map_object_from_fd (const char *name, const char *origname, int fd,
 			struct filebuf *fbp, char *realname,
 			struct link_map *loader, int l_type, int mode,
-			void **stack_endp, Lmid_t nsid)
+			const void *stack_endp, Lmid_t nsid)
 {
   struct link_map *l = NULL;
   const ElfW(Ehdr) *header;
@@ -2180,7 +2180,7 @@ _dl_map_object (struct link_map *loader, const char *name,
 
   void *stack_end = __libc_stack_end;
   return _dl_map_object_from_fd (name, origname, fd, &fb, realname, loader,
-				 type, mode, &stack_end, nsid);
+				 type, mode, stack_end, nsid);
 }
 
 struct add_path_state
diff --git a/elf/dl-reloc-static-pie.c b/elf/dl-reloc-static-pie.c
index e34bf5f7c..758bf9893 100644
--- a/elf/dl-reloc-static-pie.c
+++ b/elf/dl-reloc-static-pie.c
@@ -51,7 +51,8 @@ _dl_relocate_static_pie (void)
     switch (ph->p_type)
       {
       case PT_LOAD:
-	if (ph->p_offset == 0)
+	/* Skip the empty PT_LOAD segment at offset 0.  */
+	if (ph->p_filesz != 0 && ph->p_offset == 0)
 	  file_p_vaddr = ph->p_vaddr;
 	break;
       case PT_DYNAMIC:
diff --git a/elf/dl-support.c b/elf/dl-support.c
index a7d5a5e8a..0388e2344 100644
--- a/elf/dl-support.c
+++ b/elf/dl-support.c
@@ -332,9 +332,7 @@ _dl_non_dynamic_init (void)
 	break;
       }
 
-  if ((__glibc_unlikely (GL(dl_stack_flags)) & PF_X)
-      && TUNABLE_GET (glibc, rtld, execstack, int32_t, NULL) == 0)
-    _dl_fatal_printf ("Fatal glibc error: executable stack is not allowed\n");
+  _dl_handle_execstack_tunable ();
 
   call_function_static_weak (_dl_find_object_init);
 
diff --git a/elf/dl-tls.c b/elf/dl-tls.c
index 8306a39e8..5686df5ad 100644
--- a/elf/dl-tls.c
+++ b/elf/dl-tls.c
@@ -560,6 +560,13 @@ _dl_resize_dtv (dtv_t *dtv, size_t max_modid)
       if (newp == NULL)
 	oom ();
       memcpy (newp, &dtv[-1], (2 + oldsize) * sizeof (dtv_t));
+#ifdef SHARED
+      /* Auditors can trigger a DTV resize event while the full malloc
+	 is not yet in use.  Mark the new DTV allocation as the
+	 initial allocation.  */
+      if (!__rtld_malloc_is_complete ())
+	GL(dl_initial_dtv) = &newp[1];
+#endif
     }
   else
     {
diff --git a/elf/dl-tunables.list b/elf/dl-tunables.list
index 0b6721bc5..c03c9967f 100644
--- a/elf/dl-tunables.list
+++ b/elf/dl-tunables.list
@@ -138,7 +138,7 @@ glibc {
     execstack {
       type: INT_32
       minval: 0
-      maxval: 1
+      maxval: 2
       default: 1
     }
   }
diff --git a/elf/rtld.c b/elf/rtld.c
index 00bec1531..c1e9721de 100644
--- a/elf/rtld.c
+++ b/elf/rtld.c
@@ -1242,6 +1242,60 @@ rtld_setup_main_map (struct link_map *main_map)
   return has_interp;
 }
 
+/* Set up the program header information for the dynamic linker
+   itself.  It can be accessed via _r_debug and dl_iterate_phdr
+   callbacks, and it is used by _dl_find_object.  */
+static void
+rtld_setup_phdr (void)
+{
+  /* Starting from binutils-2.23, the linker will define the magic
+     symbol __ehdr_start to point to our own ELF header if it is
+     visible in a segment that also includes the phdrs.  */
+
+  const ElfW(Ehdr) *rtld_ehdr = &__ehdr_start;
+  assert (rtld_ehdr->e_ehsize == sizeof *rtld_ehdr);
+  assert (rtld_ehdr->e_phentsize == sizeof (ElfW(Phdr)));
+
+  const ElfW(Phdr) *rtld_phdr = (const void *) rtld_ehdr + rtld_ehdr->e_phoff;
+
+  _dl_rtld_map.l_phdr = rtld_phdr;
+  _dl_rtld_map.l_phnum = rtld_ehdr->e_phnum;
+
+
+  _dl_rtld_map.l_contiguous = 1;
+  /* The linker may not have produced a contiguous object.  The kernel
+     will load the object with actual gaps (unlike the glibc loader
+     for shared objects, which always produces a contiguous mapping).
+     See similar logic in rtld_setup_main_map above.  */
+  {
+    ElfW(Addr) expected_load_address = 0;
+    for (const ElfW(Phdr) *ph = rtld_phdr; ph < &rtld_phdr[rtld_ehdr->e_phnum];
+	 ++ph)
+      if (ph->p_type == PT_LOAD)
+	{
+	  ElfW(Addr) mapstart = ph->p_vaddr & ~(GLRO(dl_pagesize) - 1);
+	  if (_dl_rtld_map.l_contiguous && expected_load_address != 0
+	      && expected_load_address != mapstart)
+	    _dl_rtld_map.l_contiguous = 0;
+	  ElfW(Addr) allocend = ph->p_vaddr + ph->p_memsz;
+	  /* The next expected address is the page following this load
+	     segment.  */
+	  expected_load_address = ((allocend + GLRO(dl_pagesize) - 1)
+				   & ~(GLRO(dl_pagesize) - 1));
+	}
+  }
+
+  /* PT_GNU_RELRO is usually the last phdr.  */
+  size_t cnt = rtld_ehdr->e_phnum;
+  while (cnt-- > 0)
+    if (rtld_phdr[cnt].p_type == PT_GNU_RELRO)
+      {
+	_dl_rtld_map.l_relro_addr = rtld_phdr[cnt].p_vaddr;
+	_dl_rtld_map.l_relro_size = rtld_phdr[cnt].p_memsz;
+	break;
+      }
+}
+
 /* Adjusts the contents of the stack and related globals for the user
    entry point.  The ld.so processed skip_args arguments and bumped
    _dl_argv and _dl_argc accordingly.  Those arguments are removed from
@@ -1626,9 +1680,9 @@ dl_main (const ElfW(Phdr) *phdr,
 
   bool has_interp = rtld_setup_main_map (main_map);
 
-  if ((__glibc_unlikely (GL(dl_stack_flags)) & PF_X)
-      && TUNABLE_GET (glibc, rtld, execstack, int32_t, NULL) == 0)
-    _dl_fatal_printf ("Fatal glibc error: executable stack is not allowed\n");
+  /* Handle this after PT_GNU_STACK parse, because it updates dl_stack_flags
+     if required.  */
+  _dl_handle_execstack_tunable ();
 
   /* If the current libname is different from the SONAME, add the
      latter as well.  */
@@ -1710,33 +1764,7 @@ dl_main (const ElfW(Phdr) *phdr,
   ++GL(dl_ns)[LM_ID_BASE]._ns_nloaded;
   ++GL(dl_load_adds);
 
-  /* Starting from binutils-2.23, the linker will define the magic symbol
-     __ehdr_start to point to our own ELF header if it is visible in a
-     segment that also includes the phdrs.  If that's not available, we use
-     the old method that assumes the beginning of the file is part of the
-     lowest-addressed PT_LOAD segment.  */
-
-  /* Set up the program header information for the dynamic linker
-     itself.  It is needed in the dl_iterate_phdr callbacks.  */
-  const ElfW(Ehdr) *rtld_ehdr = &__ehdr_start;
-  assert (rtld_ehdr->e_ehsize == sizeof *rtld_ehdr);
-  assert (rtld_ehdr->e_phentsize == sizeof (ElfW(Phdr)));
-
-  const ElfW(Phdr) *rtld_phdr = (const void *) rtld_ehdr + rtld_ehdr->e_phoff;
-
-  _dl_rtld_map.l_phdr = rtld_phdr;
-  _dl_rtld_map.l_phnum = rtld_ehdr->e_phnum;
-
-
-  /* PT_GNU_RELRO is usually the last phdr.  */
-  size_t cnt = rtld_ehdr->e_phnum;
-  while (cnt-- > 0)
-    if (rtld_phdr[cnt].p_type == PT_GNU_RELRO)
-      {
-	_dl_rtld_map.l_relro_addr = rtld_phdr[cnt].p_vaddr;
-	_dl_rtld_map.l_relro_size = rtld_phdr[cnt].p_memsz;
-	break;
-      }
+  rtld_setup_phdr ();
 
   /* Add the dynamic linker to the TLS list if it also uses TLS.  */
   if (_dl_rtld_map.l_tls_blocksize != 0)
diff --git a/elf/tst-audit-tlsdesc-dlopen2.c b/elf/tst-audit-tlsdesc-dlopen2.c
new file mode 100644
index 000000000..7ba2c4129
--- /dev/null
+++ b/elf/tst-audit-tlsdesc-dlopen2.c
@@ -0,0 +1,46 @@
+/* Loading TLS-using modules from auditors (bug 32412).  Main program.
+   Copyright (C) 2021-2025 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <support/xdlfcn.h>
+#include <stdio.h>
+
+static int
+do_test (void)
+{
+  puts ("info: start of main program");
+
+  /* Load TLS-using modules, to trigger DTV resizing.  The dynamic
+     linker will load them again (requiring their own TLS) because the
+     dlopen calls from the auditor were in the auditing namespace.  */
+  for (int i = 1; i <= 19; ++i)
+    {
+      char dso[30];
+      snprintf (dso, sizeof (dso), "tst-tlsmod17a%d.so", i);
+      char sym[30];
+      snprintf (sym, sizeof(sym), "tlsmod17a%d", i);
+
+      void *handle = xdlopen (dso, RTLD_LAZY);
+      int (*func) (void) = xdlsym (handle, sym);
+      /* Trigger TLS allocation.  */
+      func ();
+    }
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/elf/tst-auditmod-tlsdesc2.c b/elf/tst-auditmod-tlsdesc2.c
new file mode 100644
index 000000000..50275cd34
--- /dev/null
+++ b/elf/tst-auditmod-tlsdesc2.c
@@ -0,0 +1,59 @@
+/* Loading TLS-using modules from auditors (bug 32412).  Audit module.
+   Copyright (C) 2021-2025 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <dlfcn.h>
+#include <link.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <unistd.h>
+
+unsigned int
+la_version (unsigned int version)
+{
+  /* Open some modules, to trigger DTV resizing before the switch to
+     the main malloc.  */
+  for (int i = 1; i <= 19; ++i)
+    {
+      char dso[30];
+      snprintf (dso, sizeof (dso), "tst-tlsmod17a%d.so", i);
+      char sym[30];
+      snprintf (sym, sizeof (sym), "tlsmod17a%d", i);
+
+      void *handle = dlopen (dso, RTLD_LAZY);
+      if (handle == NULL)
+        {
+          printf ("error: dlopen from auditor: %s\n", dlerror ());
+          fflush (stdout);
+          _exit (1);
+        }
+      int (*func) (void) = dlsym (handle, sym);
+      if (func == NULL)
+        {
+          printf ("error: dlsym from auditor: %s\n", dlerror ());
+          fflush (stdout);
+          _exit (1);
+        }
+      /* Trigger TLS allocation.  */
+      func ();
+    }
+
+  puts ("info: TLS-using modules loaded from auditor");
+  fflush (stdout);
+
+  return LAV_CURRENT;
+}
diff --git a/elf/tst-dlopen-sgid-mod.c b/elf/tst-dlopen-sgid-mod.c
new file mode 100644
index 000000000..5eb79eef4
--- /dev/null
+++ b/elf/tst-dlopen-sgid-mod.c
@@ -0,0 +1 @@
+/* Opening this object should not succeed.  */
diff --git a/elf/tst-dlopen-sgid.c b/elf/tst-dlopen-sgid.c
new file mode 100644
index 000000000..8aec52e19
--- /dev/null
+++ b/elf/tst-dlopen-sgid.c
@@ -0,0 +1,106 @@
+/* Test case for ignored LD_LIBRARY_PATH in static startup (bug 32976).
+   Copyright (C) 2025 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <dlfcn.h>
+#include <gnu/lib-names.h>
+#include <stddef.h>
+#include <stdint.h>
+#include <stdlib.h>
+#include <string.h>
+#include <support/capture_subprocess.h>
+#include <support/check.h>
+#include <support/support.h>
+#include <support/temp_file.h>
+#include <support/test-driver.h>
+#include <sys/wait.h>
+#include <unistd.h>
+
+/* This is the name of our test object.  Use a custom module for
+   testing, so that this object does not get picked up from the system
+   path.  */
+static const char dso_name[] = "tst-dlopen-sgid-mod.so";
+
+/* Used to mark the recursive invocation.  */
+static const char magic_argument[] = "run-actual-test";
+
+static int
+do_test (void)
+{
+  /* Pathname of the directory that receives the shared objects this
+     test attempts to load.  */
+  char *libdir = support_create_temp_directory ("tst-dlopen-sgid-");
+
+  /* This is supposed to be ignored and stripped.  */
+  TEST_COMPARE (setenv ("LD_LIBRARY_PATH", libdir, 1), 0);
+
+  /* Copy of libc.so.6.  */
+  {
+    char *from = xasprintf ("%s/%s", support_objdir_root, LIBC_SO);
+    char *to = xasprintf ("%s/%s", libdir, LIBC_SO);
+    add_temp_file (to);
+    support_copy_file (from, to);
+    free (to);
+    free (from);
+  }
+
+  /* Copy of the test object.   */
+  {
+    char *from = xasprintf ("%s/elf/%s", support_objdir_root, dso_name);
+    char *to = xasprintf ("%s/%s", libdir, dso_name);
+    add_temp_file (to);
+    support_copy_file (from, to);
+    free (to);
+    free (from);
+  }
+
+  free (libdir);
+
+  support_capture_subprogram_self_sgid (magic_argument);
+
+  return 0;
+}
+
+static void
+alternative_main (int argc, char **argv)
+{
+  if (argc == 2 && strcmp (argv[1], magic_argument) == 0)
+    {
+      if (getgid () == getegid ())
+        /* This can happen if the file system is mounted nosuid.  */
+        FAIL_UNSUPPORTED ("SGID failed: GID and EGID match (%jd)\n",
+                          (intmax_t) getgid ());
+
+      /* Should be removed due to SGID.  */
+      TEST_COMPARE_STRING (getenv ("LD_LIBRARY_PATH"), NULL);
+
+      TEST_VERIFY (dlopen (dso_name, RTLD_NOW) == NULL);
+      {
+        const char *message = dlerror ();
+        TEST_COMPARE_STRING (message,
+                             "tst-dlopen-sgid-mod.so:"
+                             " cannot open shared object file:"
+                             " No such file or directory");
+      }
+
+      support_record_failure_barrier ();
+      exit (EXIT_SUCCESS);
+    }
+}
+
+#define PREPARE alternative_main
+#include <support/test-driver.c>
diff --git a/elf/tst-env-setuid-tunables.c b/elf/tst-env-setuid-tunables.c
index a4233b172..bfdb30cbd 100644
--- a/elf/tst-env-setuid-tunables.c
+++ b/elf/tst-env-setuid-tunables.c
@@ -105,10 +105,7 @@ do_test (int argc, char **argv)
 
       if (ret != 0)
 	exit (1);
-
-      /* Special return code to make sure that the child executed all the way
-	 through.  */
-      exit (42);
+      return 0;
     }
   else
     {
@@ -127,18 +124,7 @@ do_test (int argc, char **argv)
 	      continue;
 	    }
 
-	  int status = support_capture_subprogram_self_sgid (buf);
-
-	  /* Bail out early if unsupported.  */
-	  if (WEXITSTATUS (status) == EXIT_UNSUPPORTED)
-	    return EXIT_UNSUPPORTED;
-
-	  if (WEXITSTATUS (status) != 42)
-	    {
-	      printf ("    [%d] child failed with status %d\n", i,
-		      WEXITSTATUS (status));
-	      support_record_failure ();
-	    }
+	  support_capture_subprogram_self_sgid (buf);
 	}
       return 0;
     }
diff --git a/elf/tst-env-setuid.c b/elf/tst-env-setuid.c
index 2c632ed30..7209acd61 100644
--- a/elf/tst-env-setuid.c
+++ b/elf/tst-env-setuid.c
@@ -147,10 +147,7 @@ do_test (int argc, char **argv)
 
       if (ret != 0)
 	exit (1);
-
-      /* Special return code to make sure that the child executed all the way
-	 through.  */
-      exit (42);
+      return 0;
     }
   else
     {
@@ -174,17 +171,7 @@ do_test (int argc, char **argv)
 	free (profilepath);
       }
 
-      int status = support_capture_subprogram_self_sgid (SETGID_CHILD);
-
-      if (WEXITSTATUS (status) == EXIT_UNSUPPORTED)
-	exit (EXIT_UNSUPPORTED);
-
-      if (WEXITSTATUS (status) != 42)
-	{
-	  printf ("    child failed with status %d\n",
-		  WEXITSTATUS (status));
-	  support_record_failure ();
-	}
+      support_capture_subprogram_self_sgid (SETGID_CHILD);
 
       return 0;
     }
diff --git a/elf/tst-execstack-prog-static-tunable.c b/elf/tst-execstack-prog-static-tunable.c
new file mode 100644
index 000000000..88b0ca126
--- /dev/null
+++ b/elf/tst-execstack-prog-static-tunable.c
@@ -0,0 +1 @@
+#include <tst-execstack-prog-static.c>
diff --git a/elf/tst-execstack-tunable.c b/elf/tst-execstack-tunable.c
new file mode 100644
index 000000000..9f03b0f7c
--- /dev/null
+++ b/elf/tst-execstack-tunable.c
@@ -0,0 +1 @@
+#include <tst-execstack.c>
diff --git a/elf/tst-link-map-contiguous-ldso.c b/elf/tst-link-map-contiguous-ldso.c
new file mode 100644
index 000000000..04de808bb
--- /dev/null
+++ b/elf/tst-link-map-contiguous-ldso.c
@@ -0,0 +1,98 @@
+/* Check that _dl_find_object behavior matches up with gaps.
+   Copyright (C) 2025 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <dlfcn.h>
+#include <gnu/lib-names.h>
+#include <link.h>
+#include <stdbool.h>
+#include <stdio.h>
+#include <support/check.h>
+#include <support/xdlfcn.h>
+#include <support/xunistd.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+static int
+do_test (void)
+{
+  struct link_map *l = xdlopen (LD_SO, RTLD_NOW);
+  if (!l->l_contiguous)
+    {
+      puts ("info: ld.so link map is not contiguous");
+
+      /* Try to find holes by probing with mmap.  */
+      int pagesize = getpagesize ();
+      bool gap_found = false;
+      ElfW(Addr) addr = l->l_map_start;
+      TEST_COMPARE (addr % pagesize, 0);
+      while (addr < l->l_map_end)
+        {
+          void *expected = (void *) addr;
+          void *ptr = xmmap (expected, 1, PROT_READ | PROT_WRITE,
+                             MAP_PRIVATE | MAP_ANONYMOUS, -1);
+          struct dl_find_object dlfo;
+          int dlfo_ret = _dl_find_object (expected, &dlfo);
+          if (ptr == expected)
+            {
+              if (dlfo_ret < 0)
+                {
+                  TEST_COMPARE (dlfo_ret, -1);
+                  printf ("info: hole without mapping data found at %p\n", ptr);
+                }
+              else
+                FAIL ("object \"%s\" found in gap at %p",
+                      dlfo.dlfo_link_map->l_name, ptr);
+              gap_found = true;
+            }
+          else if (dlfo_ret == 0)
+            {
+              if ((void *) dlfo.dlfo_link_map != (void *) l)
+                {
+                  printf ("info: object \"%s\" found at %p\n",
+                          dlfo.dlfo_link_map->l_name, ptr);
+                  gap_found = true;
+                }
+            }
+          else
+            TEST_COMPARE (dlfo_ret, -1);
+          xmunmap (ptr, 1);
+          addr += pagesize;
+        }
+      if (!gap_found)
+        FAIL ("no ld.so gap found");
+    }
+  else
+    {
+      puts ("info: ld.so link map is contiguous");
+
+      /* Assert that ld.so is truly contiguous in memory.  */
+      volatile long int *p = (volatile long int *) l->l_map_start;
+      volatile long int *end = (volatile long int *) l->l_map_end;
+      while (p < end)
+        {
+          *p;
+          ++p;
+        }
+    }
+
+  xdlclose (l);
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/elf/tst-link-map-contiguous-libc.c b/elf/tst-link-map-contiguous-libc.c
new file mode 100644
index 000000000..eb5728c76
--- /dev/null
+++ b/elf/tst-link-map-contiguous-libc.c
@@ -0,0 +1,57 @@
+/* Check that the entire libc.so program image is readable if contiguous.
+   Copyright (C) 2025 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <gnu/lib-names.h>
+#include <link.h>
+#include <support/check.h>
+#include <support/xdlfcn.h>
+#include <support/xunistd.h>
+#include <sys/mman.h>
+#include <unistd.h>
+
+static int
+do_test (void)
+{
+  struct link_map *l = xdlopen (LIBC_SO, RTLD_NOW);
+
+  /* The dynamic loader fills holes with PROT_NONE mappings.  */
+  if (!l->l_contiguous)
+    FAIL_EXIT1 ("libc.so link map is not contiguous");
+
+  /* Direct probing does not work because not everything is readable
+     due to PROT_NONE mappings.  */
+  int pagesize = getpagesize ();
+  ElfW(Addr) addr = l->l_map_start;
+  TEST_COMPARE (addr % pagesize, 0);
+  while (addr < l->l_map_end)
+    {
+      void *expected = (void *) addr;
+      void *ptr = xmmap (expected, 1, PROT_READ | PROT_WRITE,
+                         MAP_PRIVATE | MAP_ANONYMOUS, -1);
+      if (ptr == expected)
+        FAIL ("hole in libc.so memory image after %lu bytes",
+              (unsigned long int) (addr - l->l_map_start));
+      xmunmap (ptr, 1);
+      addr += pagesize;
+    }
+
+  xdlclose (l);
+
+  return 0;
+}
+#include <support/test-driver.c>
diff --git a/elf/tst-link-map-contiguous-main.c b/elf/tst-link-map-contiguous-main.c
new file mode 100644
index 000000000..2d1a054f0
--- /dev/null
+++ b/elf/tst-link-map-contiguous-main.c
@@ -0,0 +1,45 @@
+/* Check that the entire main program image is readable if contiguous.
+   Copyright (C) 2025 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <link.h>
+#include <support/check.h>
+#include <support/xdlfcn.h>
+
+static int
+do_test (void)
+{
+  struct link_map *l = xdlopen ("", RTLD_NOW);
+  if (!l->l_contiguous)
+    FAIL_UNSUPPORTED ("main link map is not contiguous");
+
+  /* This check only works if the kernel loaded the main program.  The
+     dynamic loader replaces gaps with PROT_NONE mappings, resulting
+     in faults.  */
+  volatile long int *p = (volatile long int *) l->l_map_start;
+  volatile long int *end = (volatile long int *) l->l_map_end;
+  while (p < end)
+    {
+      *p;
+      ++p;
+    }
+
+  xdlclose (l);
+
+  return 0;
+}
+#include <support/test-driver.c>
diff --git a/elf/tst-pie-bss-static.c b/elf/tst-pie-bss-static.c
new file mode 100644
index 000000000..5df542f9e
--- /dev/null
+++ b/elf/tst-pie-bss-static.c
@@ -0,0 +1,19 @@
+/* Test static PIE with an empty PT_LOAD segment at offset 0.
+   Copyright (C) 2025 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "tst-pie-bss.c"
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S b/elf/tst-pie-bss.c
similarity index 66%
rename from sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S
rename to elf/tst-pie-bss.c
index 7b45fcd63..ee9275424 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcmp-power10.S
+++ b/elf/tst-pie-bss.c
@@ -1,5 +1,5 @@
-/* Optimized strcmp implementation for POWER10/PPC64.
-   Copyright (C) 2021-2025 Free Software Foundation, Inc.
+/* Test PIE with an empty PT_LOAD segment at offset 0.
+   Copyright (C) 2025 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,11 +16,15 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
-#define STRCMP __strcmp_power10
+#include <stdio.h>
 
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(name)
+char bss[0xb5dce8] __attribute__ ((aligned (65536)));
 
-#include <sysdeps/powerpc/powerpc64/le/power10/strcmp.S>
-#endif /* __LITTLE_ENDIAN__ && IS_IN (libc) */
+static int
+do_test (void)
+{
+  printf ("Hello\n");
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/elf/tst-rtld-list-tunables.exp b/elf/tst-rtld-list-tunables.exp
index 9f5990f34..8df6f5906 100644
--- a/elf/tst-rtld-list-tunables.exp
+++ b/elf/tst-rtld-list-tunables.exp
@@ -13,6 +13,6 @@ glibc.malloc.top_pad: 0x20000 (min: 0x0, max: 0x[f]+)
 glibc.malloc.trim_threshold: 0x0 (min: 0x0, max: 0x[f]+)
 glibc.rtld.dynamic_sort: 2 (min: 1, max: 2)
 glibc.rtld.enable_secure: 0 (min: 0, max: 1)
-glibc.rtld.execstack: 1 (min: 0, max: 1)
+glibc.rtld.execstack: 1 (min: 0, max: 2)
 glibc.rtld.nns: 0x4 (min: 0x1, max: 0x10)
 glibc.rtld.optional_static_tls: 0x200 (min: 0x0, max: 0x[f]+)
diff --git a/iconv/iconv_prog.c b/iconv/iconv_prog.c
index 7dba5d8df..558cfb11a 100644
--- a/iconv/iconv_prog.c
+++ b/iconv/iconv_prog.c
@@ -436,7 +436,7 @@ input_error (const char *path)
 static void
 open_output_direct (void)
 {
-  output_fd = open64 (output_file, O_WRONLY | O_CREAT | O_TRUNC, 0777);
+  output_fd = open64 (output_file, O_WRONLY | O_CREAT | O_TRUNC, 0666);
   if (output_fd < 0)
     output_error ();
 }
@@ -457,7 +457,7 @@ prepare_output_file (char **argv)
   else
     {
       /* If iconv creates the output file, no overlap is possible.  */
-      output_fd = open64 (output_file, O_WRONLY | O_CREAT | O_EXCL, 0777);
+      output_fd = open64 (output_file, O_WRONLY | O_CREAT | O_EXCL, 0666);
       if (output_fd >= 0)
 	output_buffer_size = copy_buffer_size;
       else
diff --git a/iconv/tst-iconv_prog-buffer.sh b/iconv/tst-iconv_prog-buffer.sh
index 1c499d590..40340c38f 100644
--- a/iconv/tst-iconv_prog-buffer.sh
+++ b/iconv/tst-iconv_prog-buffer.sh
@@ -75,6 +75,10 @@ run_iconv () {
 }
 
 check_out_expected () {
+    if test -x "$tmp/out" ; then
+	echo "error: iconv output file is executable"
+	failure=true
+    fi
     if ! cmp -s "$tmp/out" "$tmp/expected" ; then
         echo "error: iconv output difference" >&$logfd
         echo "*** expected ***" >&$logfd
diff --git a/include/dlfcn.h b/include/dlfcn.h
index f49ee1b0c..a44420fa3 100644
--- a/include/dlfcn.h
+++ b/include/dlfcn.h
@@ -4,8 +4,7 @@
 #include <link.h>		/* For ElfW.  */
 #include <stdbool.h>
 
-extern __typeof (_dl_find_object) __dl_find_object;
-hidden_proto (__dl_find_object)
+rtld_hidden_proto (_dl_find_object)
 
 /* Internally used flag.  */
 #define __RTLD_DLOPEN	0x80000000
diff --git a/math/auto-libm-test-in b/math/auto-libm-test-in
index 01ba689aa..4f194da19 100644
--- a/math/auto-libm-test-in
+++ b/math/auto-libm-test-in
@@ -7291,6 +7291,8 @@ log10p1 -0x1p-125
 log10p1 -0x1p-1021
 log10p1 -0x1p-16381
 
+log10p1 0x1.27f7dap-17
+
 log10p1 0x7.2a4368p-4
 log10p1 0x6.d3a118p-4
 log10p1 0x5.03f228p+0
@@ -8298,6 +8300,7 @@ sinh -0x1.3dda8ap+0
 sinh -0x5.ee9218p-4
 sinh -0x1.bcfc98p+0
 sinh -0x6.9bbb6df7c5d08p-4
+sinh 0x1.250bfep-11
 # the next value generates larger error bounds on x86_64 (ldbl-96)
 sinh 0x2.c5d376167f4052f4p+12
 sinh max
@@ -8661,6 +8664,7 @@ tan 0x1.1ad374p+0
 tan -0x1.0d55b8p+0
 tan 1.57079697
 tan -1.57079697
+tan 0x1.ada6aap+27
 tan 0x1p-5
 tan 0x1p-10
 tan 0x1p-15
diff --git a/math/auto-libm-test-out-log10p1 b/math/auto-libm-test-out-log10p1
index 87bdb0bcd..f5ce96572 100644
--- a/math/auto-libm-test-out-log10p1
+++ b/math/auto-libm-test-out-log10p1
@@ -1789,6 +1789,31 @@ log10p1 -0x1p-16381
 = log10p1 tonearest binary128 -0x8p-16384 : -0x3.796f62a4dca1c654d56eaabeb4dp-16384 : inexact-ok underflow errno-erange-ok
 = log10p1 towardzero binary128 -0x8p-16384 : -0x3.796f62a4dca1c654d56eaabeb4ccp-16384 : inexact-ok underflow errno-erange-ok
 = log10p1 upward binary128 -0x8p-16384 : -0x3.796f62a4dca1c654d56eaabeb4ccp-16384 : inexact-ok underflow errno-erange-ok
+log10p1 0x1.27f7dap-17
+= log10p1 downward binary32 0x9.3fbedp-20 : 0x4.044b5p-20 : inexact-ok
+= log10p1 tonearest binary32 0x9.3fbedp-20 : 0x4.044b5p-20 : inexact-ok
+= log10p1 towardzero binary32 0x9.3fbedp-20 : 0x4.044b5p-20 : inexact-ok
+= log10p1 upward binary32 0x9.3fbedp-20 : 0x4.044b58p-20 : inexact-ok
+= log10p1 downward binary64 0x9.3fbedp-20 : 0x4.044b5157872ep-20 : inexact-ok
+= log10p1 tonearest binary64 0x9.3fbedp-20 : 0x4.044b5157872e4p-20 : inexact-ok
+= log10p1 towardzero binary64 0x9.3fbedp-20 : 0x4.044b5157872ep-20 : inexact-ok
+= log10p1 upward binary64 0x9.3fbedp-20 : 0x4.044b5157872e4p-20 : inexact-ok
+= log10p1 downward intel96 0x9.3fbedp-20 : 0x4.044b5157872e2868p-20 : inexact-ok
+= log10p1 tonearest intel96 0x9.3fbedp-20 : 0x4.044b5157872e2868p-20 : inexact-ok
+= log10p1 towardzero intel96 0x9.3fbedp-20 : 0x4.044b5157872e2868p-20 : inexact-ok
+= log10p1 upward intel96 0x9.3fbedp-20 : 0x4.044b5157872e287p-20 : inexact-ok
+= log10p1 downward m68k96 0x9.3fbedp-20 : 0x4.044b5157872e2868p-20 : inexact-ok
+= log10p1 tonearest m68k96 0x9.3fbedp-20 : 0x4.044b5157872e2868p-20 : inexact-ok
+= log10p1 towardzero m68k96 0x9.3fbedp-20 : 0x4.044b5157872e2868p-20 : inexact-ok
+= log10p1 upward m68k96 0x9.3fbedp-20 : 0x4.044b5157872e287p-20 : inexact-ok
+= log10p1 downward binary128 0x9.3fbedp-20 : 0x4.044b5157872e2868f5c04287d808p-20 : inexact-ok
+= log10p1 tonearest binary128 0x9.3fbedp-20 : 0x4.044b5157872e2868f5c04287d80cp-20 : inexact-ok
+= log10p1 towardzero binary128 0x9.3fbedp-20 : 0x4.044b5157872e2868f5c04287d808p-20 : inexact-ok
+= log10p1 upward binary128 0x9.3fbedp-20 : 0x4.044b5157872e2868f5c04287d80cp-20 : inexact-ok
+= log10p1 downward ibm128 0x9.3fbedp-20 : 0x4.044b5157872e2868f5c04287d8p-20 : inexact-ok
+= log10p1 tonearest ibm128 0x9.3fbedp-20 : 0x4.044b5157872e2868f5c04287d8p-20 : inexact-ok
+= log10p1 towardzero ibm128 0x9.3fbedp-20 : 0x4.044b5157872e2868f5c04287d8p-20 : inexact-ok
+= log10p1 upward ibm128 0x9.3fbedp-20 : 0x4.044b5157872e2868f5c04287dap-20 : inexact-ok
 log10p1 0x7.2a4368p-4
 = log10p1 downward binary32 0x7.2a4368p-4 : 0x2.9248dcp-4 : inexact-ok
 = log10p1 tonearest binary32 0x7.2a4368p-4 : 0x2.9248ep-4 : inexact-ok
diff --git a/math/auto-libm-test-out-sinh b/math/auto-libm-test-out-sinh
index 0b77a77ee..3924e19d8 100644
--- a/math/auto-libm-test-out-sinh
+++ b/math/auto-libm-test-out-sinh
@@ -2115,6 +2115,31 @@ sinh -0x6.9bbb6df7c5d08p-4
 = sinh tonearest ibm128 -0x6.9bbb6df7c5d08p-4 : -0x6.cc3ddf003dcda77f8f9e892e36p-4 : inexact-ok
 = sinh towardzero ibm128 -0x6.9bbb6df7c5d08p-4 : -0x6.cc3ddf003dcda77f8f9e892e36p-4 : inexact-ok
 = sinh upward ibm128 -0x6.9bbb6df7c5d08p-4 : -0x6.cc3ddf003dcda77f8f9e892e36p-4 : inexact-ok
+sinh 0x1.250bfep-11
+= sinh downward binary32 0x2.4a17fcp-12 : 0x2.4a17fcp-12 : inexact-ok
+= sinh tonearest binary32 0x2.4a17fcp-12 : 0x2.4a17fcp-12 : inexact-ok
+= sinh towardzero binary32 0x2.4a17fcp-12 : 0x2.4a17fcp-12 : inexact-ok
+= sinh upward binary32 0x2.4a17fcp-12 : 0x2.4a18p-12 : inexact-ok
+= sinh downward binary64 0x2.4a17fcp-12 : 0x2.4a17fdffffffep-12 : inexact-ok
+= sinh tonearest binary64 0x2.4a17fcp-12 : 0x2.4a17fep-12 : inexact-ok
+= sinh towardzero binary64 0x2.4a17fcp-12 : 0x2.4a17fdffffffep-12 : inexact-ok
+= sinh upward binary64 0x2.4a17fcp-12 : 0x2.4a17fep-12 : inexact-ok
+= sinh downward intel96 0x2.4a17fcp-12 : 0x2.4a17fdfffffff87cp-12 : inexact-ok
+= sinh tonearest intel96 0x2.4a17fcp-12 : 0x2.4a17fdfffffff88p-12 : inexact-ok
+= sinh towardzero intel96 0x2.4a17fcp-12 : 0x2.4a17fdfffffff87cp-12 : inexact-ok
+= sinh upward intel96 0x2.4a17fcp-12 : 0x2.4a17fdfffffff88p-12 : inexact-ok
+= sinh downward m68k96 0x2.4a17fcp-12 : 0x2.4a17fdfffffff87cp-12 : inexact-ok
+= sinh tonearest m68k96 0x2.4a17fcp-12 : 0x2.4a17fdfffffff88p-12 : inexact-ok
+= sinh towardzero m68k96 0x2.4a17fcp-12 : 0x2.4a17fdfffffff87cp-12 : inexact-ok
+= sinh upward m68k96 0x2.4a17fcp-12 : 0x2.4a17fdfffffff88p-12 : inexact-ok
+= sinh downward binary128 0x2.4a17fcp-12 : 0x2.4a17fdfffffff87e8d322786ec88p-12 : inexact-ok
+= sinh tonearest binary128 0x2.4a17fcp-12 : 0x2.4a17fdfffffff87e8d322786ec8ap-12 : inexact-ok
+= sinh towardzero binary128 0x2.4a17fcp-12 : 0x2.4a17fdfffffff87e8d322786ec88p-12 : inexact-ok
+= sinh upward binary128 0x2.4a17fcp-12 : 0x2.4a17fdfffffff87e8d322786ec8ap-12 : inexact-ok
+= sinh downward ibm128 0x2.4a17fcp-12 : 0x2.4a17fdfffffff87e8d322786ecp-12 : inexact-ok
+= sinh tonearest ibm128 0x2.4a17fcp-12 : 0x2.4a17fdfffffff87e8d322786edp-12 : inexact-ok
+= sinh towardzero ibm128 0x2.4a17fcp-12 : 0x2.4a17fdfffffff87e8d322786ecp-12 : inexact-ok
+= sinh upward ibm128 0x2.4a17fcp-12 : 0x2.4a17fdfffffff87e8d322786edp-12 : inexact-ok
 sinh 0x2.c5d376167f4052f4p+12
 = sinh downward binary32 0x2.c5d378p+12 : 0xf.fffffp+124 : inexact-ok overflow errno-erange-ok
 = sinh tonearest binary32 0x2.c5d378p+12 : plus_infty : inexact-ok overflow errno-erange
diff --git a/math/auto-libm-test-out-tan b/math/auto-libm-test-out-tan
index 7d00d03e1..1d5999ab9 100644
--- a/math/auto-libm-test-out-tan
+++ b/math/auto-libm-test-out-tan
@@ -2532,6 +2532,31 @@ tan -1.57079697
 = tan tonearest ibm128 -0x1.921fc00ece4f02f278ade6ad9fp+0 : 0x1.7b91a0851bbbafa14cf21c2b5c8p+20 : inexact-ok
 = tan towardzero ibm128 -0x1.921fc00ece4f02f278ade6ad9fp+0 : 0x1.7b91a0851bbbafa14cf21c2b5cp+20 : inexact-ok
 = tan upward ibm128 -0x1.921fc00ece4f02f278ade6ad9fp+0 : 0x1.7b91a0851bbbafa14cf21c2b5c8p+20 : inexact-ok
+tan 0x1.ada6aap+27
+= tan downward binary32 0xd.6d355p+24 : 0x3.d00608p-4 : inexact-ok
+= tan tonearest binary32 0xd.6d355p+24 : 0x3.d00608p-4 : inexact-ok
+= tan towardzero binary32 0xd.6d355p+24 : 0x3.d00608p-4 : inexact-ok
+= tan upward binary32 0xd.6d355p+24 : 0x3.d0060cp-4 : inexact-ok
+= tan downward binary64 0xd.6d355p+24 : 0x3.d00608p-4 : inexact-ok
+= tan tonearest binary64 0xd.6d355p+24 : 0x3.d00608p-4 : inexact-ok
+= tan towardzero binary64 0xd.6d355p+24 : 0x3.d00608p-4 : inexact-ok
+= tan upward binary64 0xd.6d355p+24 : 0x3.d006080000002p-4 : inexact-ok
+= tan downward intel96 0xd.6d355p+24 : 0x3.d006080000000504p-4 : inexact-ok
+= tan tonearest intel96 0xd.6d355p+24 : 0x3.d006080000000508p-4 : inexact-ok
+= tan towardzero intel96 0xd.6d355p+24 : 0x3.d006080000000504p-4 : inexact-ok
+= tan upward intel96 0xd.6d355p+24 : 0x3.d006080000000508p-4 : inexact-ok
+= tan downward m68k96 0xd.6d355p+24 : 0x3.d006080000000504p-4 : inexact-ok
+= tan tonearest m68k96 0xd.6d355p+24 : 0x3.d006080000000508p-4 : inexact-ok
+= tan towardzero m68k96 0xd.6d355p+24 : 0x3.d006080000000504p-4 : inexact-ok
+= tan upward m68k96 0xd.6d355p+24 : 0x3.d006080000000508p-4 : inexact-ok
+= tan downward binary128 0xd.6d355p+24 : 0x3.d0060800000005067d16c1c9c15ap-4 : inexact-ok
+= tan tonearest binary128 0xd.6d355p+24 : 0x3.d0060800000005067d16c1c9c15ap-4 : inexact-ok
+= tan towardzero binary128 0xd.6d355p+24 : 0x3.d0060800000005067d16c1c9c15ap-4 : inexact-ok
+= tan upward binary128 0xd.6d355p+24 : 0x3.d0060800000005067d16c1c9c15cp-4 : inexact-ok
+= tan downward ibm128 0xd.6d355p+24 : 0x3.d0060800000005067d16c1c9c1p-4 : inexact-ok
+= tan tonearest ibm128 0xd.6d355p+24 : 0x3.d0060800000005067d16c1c9c1p-4 : inexact-ok
+= tan towardzero ibm128 0xd.6d355p+24 : 0x3.d0060800000005067d16c1c9c1p-4 : inexact-ok
+= tan upward ibm128 0xd.6d355p+24 : 0x3.d0060800000005067d16c1c9c2p-4 : inexact-ok
 tan 0x1p-5
 = tan downward binary32 0x8p-8 : 0x8.00aabp-8 : inexact-ok
 = tan tonearest binary32 0x8p-8 : 0x8.00aacp-8 : inexact-ok
diff --git a/math/bits/mathcalls-macros.h b/math/bits/mathcalls-macros.h
index 1ef07f1f5..321ae00ec 100644
--- a/math/bits/mathcalls-macros.h
+++ b/math/bits/mathcalls-macros.h
@@ -34,7 +34,7 @@
 #define __MATHCALLX(function,suffix, args, attrib)	\
   __MATHDECLX (_Mdouble_,function,suffix, args, attrib)
 #define __MATHDECLX(type, function,suffix, args, attrib) \
-  __MATHDECL_1(type, function,suffix, args) __attribute__ (attrib);
+  __MATHDECL_1(type, function,suffix, args) __attribute__ (attrib)
 #define __MATHDECL_1_IMPL(type, function, suffix, args) \
   extern type __MATH_PRECNAME(function,suffix) args __THROW
 #define __MATHDECL_1(type, function, suffix, args) \
diff --git a/nptl/Makefile b/nptl/Makefile
index 82621c795..4be778ad6 100644
--- a/nptl/Makefile
+++ b/nptl/Makefile
@@ -701,6 +701,9 @@ $(objpfx)tst-execstack-threads.out: $(objpfx)tst-execstack-threads-mod.so
 LDFLAGS-tst-execstack-threads = -Wl,-z,noexecstack
 LDFLAGS-tst-execstack-threads-mod.so = -Wl,-z,execstack
 CFLAGS-tst-execstack-threads-mod.c += -Wno-trampolines
+ifeq ($(have-no-error-execstack),yes)
+LDFLAGS-tst-execstack-threads-mod.so += -Wl,--no-error-execstack
+endif
 
 tst-stackguard1-ARGS = --command "$(host-test-program-cmd) --child"
 tst-stackguard1-static-ARGS = --command "$(objpfx)tst-stackguard1-static --child"
diff --git a/nptl/cancellation.c b/nptl/cancellation.c
index 156e63dcf..bed0383a2 100644
--- a/nptl/cancellation.c
+++ b/nptl/cancellation.c
@@ -72,8 +72,8 @@ __syscall_cancel (__syscall_arg_t a1, __syscall_arg_t a2,
 		  __syscall_arg_t a5, __syscall_arg_t a6,
 		  __SYSCALL_CANCEL7_ARG_DEF __syscall_arg_t nr)
 {
-  int r = __internal_syscall_cancel (a1, a2, a3, a4, a5, a6,
-				     __SYSCALL_CANCEL7_ARG nr);
+  long int r = __internal_syscall_cancel (a1, a2, a3, a4, a5, a6,
+					  __SYSCALL_CANCEL7_ARG nr);
   return __glibc_unlikely (INTERNAL_SYSCALL_ERROR_P (r))
 	 ? SYSCALL_ERROR_LABEL (INTERNAL_SYSCALL_ERRNO (r))
 	 : r;
diff --git a/nptl/pthread_cancel.c b/nptl/pthread_cancel.c
index f7ce3ec51..b83827388 100644
--- a/nptl/pthread_cancel.c
+++ b/nptl/pthread_cancel.c
@@ -41,15 +41,17 @@ sigcancel_handler (int sig, siginfo_t *si, void *ctx)
       || si->si_code != SI_TKILL)
     return;
 
-  /* Check if asynchronous cancellation mode is set or if interrupted
-     instruction pointer falls within the cancellable syscall bridge.  For
-     interruptable syscalls with external side-effects (i.e. partial reads),
-     the kernel  will set the IP to after __syscall_cancel_arch_end, thus
-     disabling the cancellation and allowing the process to handle such
+  /* Check if asynchronous cancellation mode is set and cancellation is not
+     already in progress, or if interrupted instruction pointer falls within
+     the cancellable syscall bridge.
+     For interruptible syscalls with external side-effects (e.g. partial
+     reads), the kernel will set the IP to after __syscall_cancel_arch_end,
+     thus disabling the cancellation and allowing the process to handle such
      conditions.  */
   struct pthread *self = THREAD_SELF;
   int oldval = atomic_load_relaxed (&self->cancelhandling);
-  if (cancel_async_enabled (oldval) || cancellation_pc_check (ctx))
+  if (cancel_enabled_and_canceled_and_async (oldval)
+      || cancellation_pc_check (ctx))
     __syscall_do_cancel ();
 }
 
diff --git a/nptl/pthread_getattr_np.c b/nptl/pthread_getattr_np.c
index e98e2df15..43dd16d59 100644
--- a/nptl/pthread_getattr_np.c
+++ b/nptl/pthread_getattr_np.c
@@ -145,9 +145,9 @@ __pthread_getattr_np (pthread_t thread_id, pthread_attr_t *attr)
 			  > (size_t) iattr->stackaddr - last_to)
 			iattr->stacksize = (size_t) iattr->stackaddr - last_to;
 #else
-		      /* The limit might be too high.  */
+		      /* The limit might be too low.  */
 		      if ((size_t) iattr->stacksize
-			  > to - (size_t) iattr->stackaddr)
+			  < to - (size_t) iattr->stackaddr)
 			iattr->stacksize = to - (size_t) iattr->stackaddr;
 #endif
 		      /* We succeed and no need to look further.  */
diff --git a/posix/Makefile b/posix/Makefile
index a650abf59..0e209a7ed 100644
--- a/posix/Makefile
+++ b/posix/Makefile
@@ -303,6 +303,7 @@ tests := \
   tst-posix_spawn-setsid \
   tst-preadwrite \
   tst-preadwrite64 \
+  tst-regcomp-bracket-free \
   tst-regcomp-truncated \
   tst-regex \
   tst-regex2 \
diff --git a/posix/environ.c b/posix/environ.c
index a0ed0d80e..924effe3c 100644
--- a/posix/environ.c
+++ b/posix/environ.c
@@ -2,6 +2,7 @@
 
 #include <unistd.h>
 #include <stddef.h>
+#include <stdlib/setenv.h>
 
 /* This must be initialized; we cannot have a weak alias into bss.  */
 char **__environ = NULL;
@@ -10,3 +11,6 @@ weak_alias (__environ, environ)
 /* The SVR4 ABI says `_environ' will be the name to use
    in case the user overrides the weak alias `environ'.  */
 weak_alias (__environ, _environ)
+
+struct environ_array *__environ_array_list;
+environ_counter __environ_counter;
diff --git a/posix/regcomp.c b/posix/regcomp.c
index 69675d81f..5c486cee5 100644
--- a/posix/regcomp.c
+++ b/posix/regcomp.c
@@ -3384,6 +3384,7 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
     {
 #ifdef RE_ENABLE_I18N
       free_charset (mbcset);
+      mbcset = NULL;
 #endif
       /* Build a tree for simple bracket.  */
       br_token.type = SIMPLE_BRACKET;
@@ -3399,7 +3400,8 @@ parse_bracket_exp (re_string_t *regexp, re_dfa_t *dfa, re_token_t *token,
  parse_bracket_exp_free_return:
   re_free (sbcset);
 #ifdef RE_ENABLE_I18N
-  free_charset (mbcset);
+  if (__glibc_likely (mbcset != NULL))
+    free_charset (mbcset);
 #endif /* RE_ENABLE_I18N */
   return NULL;
 }
diff --git a/posix/tst-regcomp-bracket-free.c b/posix/tst-regcomp-bracket-free.c
new file mode 100644
index 000000000..3c091d8c4
--- /dev/null
+++ b/posix/tst-regcomp-bracket-free.c
@@ -0,0 +1,176 @@
+/* Test regcomp bracket parsing with injected allocation failures (bug 33185).
+   Copyright (C) 2025 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* This test invokes regcomp multiple times, failing one memory
+   allocation in each call.  The function call should fail with
+   REG_ESPACE (or succeed if it can recover from the allocation
+   failure).  Previously, there was a double-free bug.  */
+
+#include <errno.h>
+#include <regex.h>
+#include <stdio.h>
+#include <string.h>
+#include <support/check.h>
+#include <support/namespace.h>
+#include <support/support.h>
+
+/* Data structure allocated via MAP_SHARED, so that writes from the
+   subprocess are visible.  */
+struct shared_data
+{
+  /* Number of tracked allocations performed so far.  */
+  volatile unsigned int allocation_count;
+
+  /* If this number is reached, one allocation fails.  */
+  volatile unsigned int failing_allocation;
+
+  /* The subprocess stores the expected name here.  */
+  char name[100];
+};
+
+/* Allocation count in shared mapping.  */
+static struct shared_data *shared;
+
+/* Returns true if a failure should be injected for this allocation.  */
+static bool
+fail_this_allocation (void)
+{
+  if (shared != NULL)
+    {
+      unsigned int count = shared->allocation_count;
+      shared->allocation_count = count + 1;
+      return count == shared->failing_allocation;
+    }
+  else
+    return false;
+}
+
+/* Failure-injecting wrappers for allocation functions used by glibc.  */
+
+void *
+malloc (size_t size)
+{
+  if (fail_this_allocation ())
+    {
+      errno = ENOMEM;
+      return NULL;
+    }
+  extern __typeof (malloc) __libc_malloc;
+  return __libc_malloc (size);
+}
+
+void *
+calloc (size_t a, size_t b)
+{
+  if (fail_this_allocation ())
+    {
+      errno = ENOMEM;
+      return NULL;
+    }
+  extern __typeof (calloc) __libc_calloc;
+  return __libc_calloc (a, b);
+}
+
+void *
+realloc (void *ptr, size_t size)
+{
+  if (fail_this_allocation ())
+    {
+      errno = ENOMEM;
+      return NULL;
+    }
+  extern __typeof (realloc) __libc_realloc;
+  return __libc_realloc (ptr, size);
+}
+
+/* No-op subprocess to verify that support_isolate_in_subprocess does
+   not perform any heap allocations.  */
+static void
+no_op (void *ignored)
+{
+}
+
+/* Perform a regcomp call in a subprocess.  Used to count its
+   allocations.  */
+static void
+initialize (void *regexp1)
+{
+  const char *regexp = regexp1;
+
+  shared->allocation_count = 0;
+
+  regex_t reg;
+  TEST_COMPARE (regcomp (&reg, regexp, 0), 0);
+}
+
+/* Perform regcomp in a subprocess with fault injection.  */
+static void
+test_in_subprocess (void *regexp1)
+{
+  const char *regexp = regexp1;
+  unsigned int inject_at = shared->failing_allocation;
+
+  regex_t reg;
+  int ret = regcomp (&reg, regexp, 0);
+
+  if (ret != 0)
+    {
+      TEST_COMPARE (ret, REG_ESPACE);
+      printf ("info: allocation %u failure results in return value %d,"
+              " error %s (%d)\n",
+              inject_at, ret, strerrorname_np (errno), errno);
+    }
+}
+
+static int
+do_test (void)
+{
+  char regexp[] = "[:alpha:]";
+
+  shared = support_shared_allocate (sizeof (*shared));
+
+  /* Disable fault injection.  */
+  shared->failing_allocation = ~0U;
+
+  support_isolate_in_subprocess (no_op, NULL);
+  TEST_COMPARE (shared->allocation_count, 0);
+
+  support_isolate_in_subprocess (initialize, regexp);
+
+  /* The number of allocations in the successful case, plus some
+     slack.  Once the number of expected allocations is exceeded,
+     injecting further failures does not make a difference.  */
+  unsigned int maximum_allocation_count = shared->allocation_count;
+  printf ("info: successful call performs %u allocations\n",
+          maximum_allocation_count);
+  maximum_allocation_count += 10;
+
+  for (unsigned int inject_at = 0; inject_at <= maximum_allocation_count;
+       ++inject_at)
+    {
+      shared->allocation_count = 0;
+      shared->failing_allocation = inject_at;
+      support_isolate_in_subprocess (test_in_subprocess, regexp);
+    }
+
+  support_shared_free (shared);
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/stdlib/Makefile b/stdlib/Makefile
index 1c4fa2382..c9c8f702a 100644
--- a/stdlib/Makefile
+++ b/stdlib/Makefile
@@ -282,6 +282,7 @@ tests := \
   tst-environ-change-3 \
   tst-environ-change-4 \
   tst-getenv-signal \
+  tst-getenv-static \
   tst-getenv-thread \
   tst-getenv-unsetenv \
   tst-getrandom \
@@ -377,6 +378,7 @@ tests-internal := \
   # tests-internal
 
 tests-static := \
+  tst-getenv-static \
   tst-secure-getenv \
   # tests-static
 
diff --git a/stdlib/abort.c b/stdlib/abort.c
index caa9e6dc0..904244a2f 100644
--- a/stdlib/abort.c
+++ b/stdlib/abort.c
@@ -19,6 +19,7 @@
 #include <internal-signals.h>
 #include <libc-lock.h>
 #include <pthreadP.h>
+#include <string.h>
 #include <unistd.h>
 
 /* Try to get a machine dependent instruction which will make the
@@ -42,7 +43,10 @@ __libc_rwlock_define_initialized (static, lock);
 void
 __abort_fork_reset_child (void)
 {
-  __libc_rwlock_init (lock);
+  /* Reinitialize lock without calling pthread_rwlock_init, to
+     avoid a valgrind DRD false positive.  */
+  __libc_rwlock_define_initialized (, reset_lock);
+  memcpy (&lock, &reset_lock, sizeof (lock));
 }
 
 void
diff --git a/stdlib/getenv.c b/stdlib/getenv.c
index 5e7212cca..1a7b0bfc0 100644
--- a/stdlib/getenv.c
+++ b/stdlib/getenv.c
@@ -20,9 +20,6 @@
 #include <string.h>
 #include <unistd.h>
 
-struct environ_array *__environ_array_list;
-environ_counter __environ_counter;
-
 char *
 getenv (const char *name)
 {
diff --git a/stdlib/tst-getenv-static.c b/stdlib/tst-getenv-static.c
new file mode 100644
index 000000000..f5f484c83
--- /dev/null
+++ b/stdlib/tst-getenv-static.c
@@ -0,0 +1,38 @@
+/* Static interposition of getenv (bug 32541).
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <stdlib.h>
+#include <support/check.h>
+
+/* Some programs try to interpose getenv for their own use (not
+   glibc's internal use).  Make sure that this is possible without
+   introducing linker failures due to duplicate symbols.  */
+
+char *
+getenv (const char *ignored)
+{
+  return NULL;
+}
+
+static int
+do_test (void)
+{
+  TEST_COMPARE_STRING (getenv ("PATH"), NULL);
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/stdlib/tst-secure-getenv.c b/stdlib/tst-secure-getenv.c
index 3fd1d232b..c12c63aee 100644
--- a/stdlib/tst-secure-getenv.c
+++ b/stdlib/tst-secure-getenv.c
@@ -57,13 +57,7 @@ do_test (void)
       exit (1);
     }
 
-  int status = support_capture_subprogram_self_sgid (MAGIC_ARGUMENT);
-
-  if (WEXITSTATUS (status) == EXIT_UNSUPPORTED)
-    return EXIT_UNSUPPORTED;
-
-  if (!WIFEXITED (status))
-    FAIL_EXIT1 ("Unexpected exit status %d from child process\n", status);
+  support_capture_subprogram_self_sgid (MAGIC_ARGUMENT);
 
   return 0;
 }
@@ -82,6 +76,7 @@ alternative_main (int argc, char **argv)
       if (secure_getenv ("PATH") != NULL)
 	FAIL_EXIT (4, "PATH variable not filtered out\n");
 
+      support_record_failure_barrier ();
       exit (EXIT_SUCCESS);
     }
 }
diff --git a/support/capture_subprocess.h b/support/capture_subprocess.h
index 91d75e5d6..b37462d0d 100644
--- a/support/capture_subprocess.h
+++ b/support/capture_subprocess.h
@@ -42,11 +42,12 @@ struct support_capture_subprocess support_capture_subprocess
 struct support_capture_subprocess support_capture_subprogram
   (const char *file, char *const argv[], char *const envp[]);
 
-/* Copy the running program into a setgid binary and run it with CHILD_ID
-   argument.  If execution is successful, return the exit status of the child
-   program, otherwise return a non-zero failure exit code.  */
-int support_capture_subprogram_self_sgid
-  (char *child_id);
+/* Copy the running program into a setgid binary and run it with
+   CHILD_ID argument.  If the program exits with a non-zero status,
+   exit with that exit status (or status 1 if the program did not exit
+   normally).  If the test cannot be performed, exit with
+   EXIT_UNSUPPORTED.  */
+void support_capture_subprogram_self_sgid (const char *child_id);
 
 /* Deallocate the subprocess data captured by
    support_capture_subprocess.  */
diff --git a/support/support_capture_subprocess.c b/support/support_capture_subprocess.c
index c3ef478d1..b4e4bf950 100644
--- a/support/support_capture_subprocess.c
+++ b/support/support_capture_subprocess.c
@@ -21,12 +21,17 @@
 
 #include <errno.h>
 #include <fcntl.h>
+#include <grp.h>
+#include <scratch_buffer.h>
+#include <stdio_ext.h>
 #include <stdlib.h>
+#include <string.h>
 #include <support/check.h>
 #include <support/xunistd.h>
 #include <support/xsocket.h>
 #include <support/xspawn.h>
 #include <support/support.h>
+#include <support/temp_file.h>
 #include <support/test-driver.h>
 
 static void
@@ -109,111 +114,88 @@ support_capture_subprogram (const char *file, char *const argv[],
 /* Copies the executable into a restricted directory, so that we can
    safely make it SGID with the TARGET group ID.  Then runs the
    executable.  */
-static int
-copy_and_spawn_sgid (char *child_id, gid_t gid)
+static void
+copy_and_spawn_sgid (const char *child_id, gid_t gid)
 {
-  char *dirname = xasprintf ("%s/tst-tunables-setuid.%jd",
-			     test_dir, (intmax_t) getpid ());
+  char *dirname = support_create_temp_directory ("tst-glibc-sgid-");
   char *execname = xasprintf ("%s/bin", dirname);
-  int infd = -1;
-  int outfd = -1;
-  int ret = 1, status = 1;
-
-  TEST_VERIFY (mkdir (dirname, 0700) == 0);
-  if (support_record_failure_is_failed ())
-    goto err;
+  add_temp_file (execname);
 
-  infd = open ("/proc/self/exe", O_RDONLY);
-  if (infd < 0)
+  if (access ("/proc/self/exe", R_OK) != 0)
     FAIL_UNSUPPORTED ("unsupported: Cannot read binary from procfs\n");
 
-  outfd = open (execname, O_WRONLY | O_CREAT | O_EXCL, 0700);
-  TEST_VERIFY (outfd >= 0);
-  if (support_record_failure_is_failed ())
-    goto err;
-
-  char buf[4096];
-  for (;;)
-    {
-      ssize_t rdcount = read (infd, buf, sizeof (buf));
-      TEST_VERIFY (rdcount >= 0);
-      if (support_record_failure_is_failed ())
-	goto err;
-      if (rdcount == 0)
-	break;
-      char *p = buf;
-      char *end = buf + rdcount;
-      while (p != end)
-	{
-	  ssize_t wrcount = write (outfd, buf, end - p);
-	  if (wrcount == 0)
-	    errno = ENOSPC;
-	  TEST_VERIFY (wrcount > 0);
-	  if (support_record_failure_is_failed ())
-	    goto err;
-	  p += wrcount;
-	}
-    }
+  support_copy_file ("/proc/self/exe", execname);
 
-  bool chowned = false;
-  TEST_VERIFY ((chowned = fchown (outfd, getuid (), gid) == 0)
-	       || errno == EPERM);
-  if (support_record_failure_is_failed ())
-    goto err;
-  else if (!chowned)
-    {
-      ret = 77;
-      goto err;
-    }
+  if (chown (execname, getuid (), gid) != 0)
+    FAIL_UNSUPPORTED ("cannot change group of \"%s\" to %jd: %m",
+		      execname, (intmax_t) gid);
 
-  TEST_VERIFY (fchmod (outfd, 02750) == 0);
-  if (support_record_failure_is_failed ())
-    goto err;
-  TEST_VERIFY (close (outfd) == 0);
-  if (support_record_failure_is_failed ())
-    goto err;
-  TEST_VERIFY (close (infd) == 0);
-  if (support_record_failure_is_failed ())
-    goto err;
+  if (chmod (execname, 02750) != 0)
+    FAIL_UNSUPPORTED ("cannot make \"%s\" SGID: %m", execname);
 
   /* We have the binary, now spawn the subprocess.  Avoid using
      support_subprogram because we only want the program exit status, not the
      contents.  */
-  ret = 0;
-  infd = outfd = -1;
 
-  char * const args[] = {execname, child_id, NULL};
+  char * const args[] = {execname, (char *) child_id, NULL};
+  int status = support_subprogram_wait (args[0], args);
 
-  status = support_subprogram_wait (args[0], args);
+  free (execname);
+  free (dirname);
 
-err:
-  if (outfd >= 0)
-    close (outfd);
-  if (infd >= 0)
-    close (infd);
-  if (execname != NULL)
+  if (WIFEXITED (status))
     {
-      unlink (execname);
-      free (execname);
+      if (WEXITSTATUS (status) == 0)
+	return;
+      else
+	exit (WEXITSTATUS (status));
     }
-  if (dirname != NULL)
+  else
+    FAIL_EXIT1 ("subprogram failed with status %d", status);
+}
+
+/* Returns true if a group with NAME has been found, and writes its
+   GID to *TARGET.  */
+static bool
+find_sgid_group (gid_t *target, const char *name)
+{
+  /* Do not use getgrnam_r because it does not work in statically
+     linked binaries if the system libc is different.  */
+  FILE *fp = fopen ("/etc/group", "rce");
+  if (fp == NULL)
+    return false;
+  __fsetlocking (fp, FSETLOCKING_BYCALLER);
+
+  bool ok = false;
+  struct scratch_buffer buf;
+  scratch_buffer_init (&buf);
+  while (true)
     {
-      rmdir (dirname);
-      free (dirname);
+      struct group grp;
+      struct group *result = NULL;
+      int status = fgetgrent_r (fp, &grp, buf.data, buf.length, &result);
+      if (status == 0 && result != NULL)
+	{
+	  if (strcmp (result->gr_name, name) == 0)
+	    {
+	      *target = result->gr_gid;
+	      ok = true;
+	      break;
+	    }
+	}
+      else if (status != ERANGE)
+	break;
+      else if (!scratch_buffer_grow (&buf))
+	break;
     }
-
-  if (ret == 77)
-    FAIL_UNSUPPORTED ("Failed to make sgid executable for test\n");
-  if (ret != 0)
-    FAIL_EXIT1 ("Failed to make sgid executable for test\n");
-
-  return status;
+  scratch_buffer_free (&buf);
+  fclose (fp);
+  return ok;
 }
 
-int
-support_capture_subprogram_self_sgid (char *child_id)
+void
+support_capture_subprogram_self_sgid (const char *child_id)
 {
-  gid_t target = 0;
   const int count = 64;
   gid_t groups[count];
 
@@ -225,6 +207,7 @@ support_capture_subprogram_self_sgid (char *child_id)
 		     (intmax_t) getuid ());
 
   gid_t current = getgid ();
+  gid_t target = current;
   for (int i = 0; i < ret; ++i)
     {
       if (groups[i] != current)
@@ -234,11 +217,18 @@ support_capture_subprogram_self_sgid (char *child_id)
 	}
     }
 
-  if (target == 0)
-    FAIL_UNSUPPORTED("Could not find a suitable GID for user %jd\n",
-		     (intmax_t) getuid ());
+  if (target == current)
+    {
+      /* If running as root, try to find a harmless group for SGID.  */
+      if (getuid () != 0
+	  || (!find_sgid_group (&target, "nogroup")
+	      && !find_sgid_group (&target, "bin")
+	      && !find_sgid_group (&target, "daemon")))
+	FAIL_UNSUPPORTED("Could not find a suitable GID for user %jd\n",
+			 (intmax_t) getuid ());
+    }
 
-  return copy_and_spawn_sgid (child_id, target);
+  copy_and_spawn_sgid (child_id, target);
 }
 
 void
diff --git a/sysdeps/aarch64/fpu/acos_advsimd.c b/sysdeps/aarch64/fpu/acos_advsimd.c
index 7709b5454..453f78031 100644
--- a/sysdeps/aarch64/fpu/acos_advsimd.c
+++ b/sysdeps/aarch64/fpu/acos_advsimd.c
@@ -18,24 +18,23 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include "v_math.h"
-#include "poly_advsimd_f64.h"
 
 static const struct data
 {
-  float64x2_t poly[12];
-  float64x2_t pi, pi_over_2;
+  double c1, c3, c5, c7, c9, c11;
+  float64x2_t c0, c2, c4, c6, c8, c10;
   uint64x2_t abs_mask;
+  float64x2_t pi, pi_over_2;
 } data = {
   /* Polynomial approximation of  (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
      on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57.  */
-  .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4),
-	    V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6),
-	    V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6),
-	    V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7),
-	    V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6),
-	    V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), },
-  .pi = V2 (0x1.921fb54442d18p+1),
-  .pi_over_2 = V2 (0x1.921fb54442d18p+0),
+  .c0 = V2 (0x1.555555555554ep-3),     .c1 = 0x1.3333333337233p-4,
+  .c2 = V2 (0x1.6db6db67f6d9fp-5),     .c3 = 0x1.f1c71fbd29fbbp-6,
+  .c4 = V2 (0x1.6e8b264d467d6p-6),     .c5 = 0x1.1c5997c357e9dp-6,
+  .c6 = V2 (0x1.c86a22cd9389dp-7),     .c7 = 0x1.856073c22ebbep-7,
+  .c8 = V2 (0x1.fd1151acb6bedp-8),     .c9 = 0x1.087182f799c1dp-6,
+  .c10 = V2 (-0x1.6602748120927p-7),   .c11 = 0x1.cfa0dd1f9478p-6,
+  .pi = V2 (0x1.921fb54442d18p+1),     .pi_over_2 = V2 (0x1.921fb54442d18p+0),
   .abs_mask = V2 (0x7fffffffffffffff),
 };
 
@@ -63,7 +62,7 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
 
      acos(x) ~ pi/2 - (x + x^3 P(x^2)).
 
-   The largest observed error in this region is 1.18 ulps,
+   The largest observed error in this region is 1.18 ulp:
    _ZGVnN2v_acos (0x1.fbab0a7c460f6p-2) got 0x1.0d54d1985c068p+0
 				       want 0x1.0d54d1985c069p+0.
 
@@ -71,9 +70,9 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
 
      acos(x) = y + y * z * P(z), with  z = (1-x)/2 and y = sqrt(z).
 
-   The largest observed error in this region is 1.52 ulps,
-   _ZGVnN2v_acos (0x1.23d362722f591p-1) got 0x1.edbbedf8a7d6ep-1
-				       want 0x1.edbbedf8a7d6cp-1.  */
+   The largest observed error in this region is 1.50 ulp:
+   _ZGVnN2v_acos (0x1.252a2cf3fb9acp-1) got 0x1.ec1a46aa82901p-1
+				       want 0x1.ec1a46aa829p-1.  */
 float64x2_t VPCS_ATTR V_NAME_D1 (acos) (float64x2_t x)
 {
   const struct data *d = ptr_barrier (&data);
@@ -99,13 +98,32 @@ float64x2_t VPCS_ATTR V_NAME_D1 (acos) (float64x2_t x)
   float64x2_t z = vbslq_f64 (a_le_half, ax, vsqrtq_f64 (z2));
 
   /* Use a single polynomial approximation P for both intervals.  */
+  float64x2_t z3 = vmulq_f64 (z2, z);
   float64x2_t z4 = vmulq_f64 (z2, z2);
   float64x2_t z8 = vmulq_f64 (z4, z4);
-  float64x2_t z16 = vmulq_f64 (z8, z8);
-  float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly);
 
-  /* Finalize polynomial: z + z * z2 * P(z2).  */
-  p = vfmaq_f64 (z, vmulq_f64 (z, z2), p);
+  /* Order-11 Estrin.  */
+  float64x2_t c13 = vld1q_f64 (&d->c1);
+  float64x2_t c57 = vld1q_f64 (&d->c5);
+  float64x2_t c911 = vld1q_f64 (&d->c9);
+
+  float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
+  float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
+  float64x2_t p03 = vfmaq_f64 (p01, z4, p23);
+
+  float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
+  float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
+  float64x2_t p47 = vfmaq_f64 (p45, z4, p67);
+
+  float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
+  float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
+  float64x2_t p811 = vfmaq_f64 (p89, z4, p1011);
+
+  float64x2_t p411 = vfmaq_f64 (p47, z8, p811);
+  float64x2_t p = vfmaq_f64 (p03, z8, p411);
+
+  /* Finalize polynomial: z + z3 * P(z2).  */
+  p = vfmaq_f64 (z, z3, p);
 
   /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for  |x| < 0.5
 	       = 2 Q(|x|)               , for  0.5 < x < 1.0
diff --git a/sysdeps/aarch64/fpu/acos_sve.c b/sysdeps/aarch64/fpu/acos_sve.c
index 74e2f7df0..104f0d780 100644
--- a/sysdeps/aarch64/fpu/acos_sve.c
+++ b/sysdeps/aarch64/fpu/acos_sve.c
@@ -18,20 +18,21 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include "sv_math.h"
-#include "poly_sve_f64.h"
 
 static const struct data
 {
-  float64_t poly[12];
-  float64_t pi, pi_over_2;
+  float64_t c1, c3, c5, c7, c9, c11;
+  float64_t c0, c2, c4, c6, c8, c10;
+  float64_t pi_over_2;
 } data = {
   /* Polynomial approximation of  (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
      on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57.  */
-  .poly = { 0x1.555555555554ep-3, 0x1.3333333337233p-4, 0x1.6db6db67f6d9fp-5,
-	    0x1.f1c71fbd29fbbp-6, 0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6,
-	    0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7, 0x1.fd1151acb6bedp-8,
-	    0x1.087182f799c1dp-6, -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6, },
-  .pi = 0x1.921fb54442d18p+1,
+  .c0 = 0x1.555555555554ep-3,	     .c1 = 0x1.3333333337233p-4,
+  .c2 = 0x1.6db6db67f6d9fp-5,	     .c3 = 0x1.f1c71fbd29fbbp-6,
+  .c4 = 0x1.6e8b264d467d6p-6,	     .c5 = 0x1.1c5997c357e9dp-6,
+  .c6 = 0x1.c86a22cd9389dp-7,	     .c7 = 0x1.856073c22ebbep-7,
+  .c8 = 0x1.fd1151acb6bedp-8,	     .c9 = 0x1.087182f799c1dp-6,
+  .c10 = -0x1.6602748120927p-7,	     .c11 = 0x1.cfa0dd1f9478p-6,
   .pi_over_2 = 0x1.921fb54442d18p+0,
 };
 
@@ -42,20 +43,21 @@ static const struct data
 
      acos(x) ~ pi/2 - (x + x^3 P(x^2)).
 
-   The largest observed error in this region is 1.18 ulps,
-   _ZGVsMxv_acos (0x1.fbc5fe28ee9e3p-2) got 0x1.0d4d0f55667f6p+0
-				       want 0x1.0d4d0f55667f7p+0.
+   The largest observed error in this region is 1.18 ulp:
+   _ZGVsMxv_acos (0x1.fbb7c9079b429p-2) got 0x1.0d51266607582p+0
+				       want 0x1.0d51266607583p+0.
 
    For |x| in [0.5, 1.0], use same approximation with a change of variable
 
      acos(x) = y + y * z * P(z), with  z = (1-x)/2 and y = sqrt(z).
 
-   The largest observed error in this region is 1.52 ulps,
-   _ZGVsMxv_acos (0x1.24024271a500ap-1) got 0x1.ed82df4243f0dp-1
-				       want 0x1.ed82df4243f0bp-1.  */
+   The largest observed error in this region is 1.50 ulp:
+   _ZGVsMxv_acos (0x1.252a2cf3fb9acp-1) got 0x1.ec1a46aa82901p-1
+				       want 0x1.ec1a46aa829p-1.  */
 svfloat64_t SV_NAME_D1 (acos) (svfloat64_t x, const svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);
+  svbool_t ptrue = svptrue_b64 ();
 
   svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000);
   svfloat64_t ax = svabs_x (pg, x);
@@ -70,24 +72,41 @@ svfloat64_t SV_NAME_D1 (acos) (svfloat64_t x, const svbool_t pg)
   svfloat64_t z = svsqrt_m (ax, a_gt_half, z2);
 
   /* Use a single polynomial approximation P for both intervals.  */
-  svfloat64_t z4 = svmul_x (pg, z2, z2);
-  svfloat64_t z8 = svmul_x (pg, z4, z4);
-  svfloat64_t z16 = svmul_x (pg, z8, z8);
-  svfloat64_t p = sv_estrin_11_f64_x (pg, z2, z4, z8, z16, d->poly);
+  svfloat64_t z3 = svmul_x (ptrue, z2, z);
+  svfloat64_t z4 = svmul_x (ptrue, z2, z2);
+  svfloat64_t z8 = svmul_x (ptrue, z4, z4);
+
+  svfloat64_t c13 = svld1rq (ptrue, &d->c1);
+  svfloat64_t c57 = svld1rq (ptrue, &d->c5);
+  svfloat64_t c911 = svld1rq (ptrue, &d->c9);
+
+  svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), z2, c13, 0);
+  svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), z2, c13, 1);
+  svfloat64_t p03 = svmla_x (pg, p01, z4, p23);
+
+  svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), z2, c57, 0);
+  svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), z2, c57, 1);
+  svfloat64_t p47 = svmla_x (pg, p45, z4, p67);
+
+  svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), z2, c911, 0);
+  svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), z2, c911, 1);
+  svfloat64_t p811 = svmla_x (pg, p89, z4, p1011);
+
+  svfloat64_t p411 = svmla_x (pg, p47, z8, p811);
+  svfloat64_t p = svmad_x (pg, p411, z8, p03);
 
   /* Finalize polynomial: z + z * z2 * P(z2).  */
-  p = svmla_x (pg, z, svmul_x (pg, z, z2), p);
+  p = svmad_x (pg, p, z3, z);
 
   /* acos(|x|) = pi/2 - sign(x) * Q(|x|), for  |x| < 0.5
 	       = 2 Q(|x|)               , for  0.5 < x < 1.0
 	       = pi - 2 Q(|x|)          , for -1.0 < x < -0.5.  */
-  svfloat64_t y
-      = svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (p), sign));
-
-  svbool_t is_neg = svcmplt (pg, x, 0.0);
-  svfloat64_t off = svdup_f64_z (is_neg, d->pi);
-  svfloat64_t mul = svsel (a_gt_half, sv_f64 (2.0), sv_f64 (-1.0));
-  svfloat64_t add = svsel (a_gt_half, off, sv_f64 (d->pi_over_2));
-
-  return svmla_x (pg, add, mul, y);
+  svfloat64_t mul = svreinterpret_f64 (
+      svlsl_m (a_gt_half, svreinterpret_u64 (sv_f64 (1.0)), 10));
+  mul = svreinterpret_f64 (sveor_x (ptrue, svreinterpret_u64 (mul), sign));
+  svfloat64_t add = svreinterpret_f64 (
+      svorr_x (ptrue, sign, svreinterpret_u64 (sv_f64 (d->pi_over_2))));
+  add = svsub_m (a_gt_half, sv_f64 (d->pi_over_2), add);
+
+  return svmsb_x (pg, p, mul, add);
 }
diff --git a/sysdeps/aarch64/fpu/acosh_sve.c b/sysdeps/aarch64/fpu/acosh_sve.c
index 326b2cca2..3a84959f0 100644
--- a/sysdeps/aarch64/fpu/acosh_sve.c
+++ b/sysdeps/aarch64/fpu/acosh_sve.c
@@ -30,10 +30,10 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
 }
 
 /* SVE approximation for double-precision acosh, based on log1p.
-   The largest observed error is 3.19 ULP in the region where the
+   The largest observed error is 3.14 ULP in the region where the
    argument to log1p falls in the k=0 interval, i.e. x close to 1:
-   SV_NAME_D1 (acosh)(0x1.1e4388d4ca821p+0) got 0x1.ed23399f5137p-2
-					   want 0x1.ed23399f51373p-2.  */
+   SV_NAME_D1 (acosh)(0x1.1e80ed12f0ad1p+0) got 0x1.ef0cee7c33ce1p-2
+					   want 0x1.ef0cee7c33ce4p-2.  */
 svfloat64_t SV_NAME_D1 (acosh) (svfloat64_t x, const svbool_t pg)
 {
   /* (ix - One) >= (BigBound - One).  */
diff --git a/sysdeps/aarch64/fpu/asin_advsimd.c b/sysdeps/aarch64/fpu/asin_advsimd.c
index 414211627..f74141c84 100644
--- a/sysdeps/aarch64/fpu/asin_advsimd.c
+++ b/sysdeps/aarch64/fpu/asin_advsimd.c
@@ -18,24 +18,23 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include "v_math.h"
-#include "poly_advsimd_f64.h"
 
 static const struct data
 {
-  float64x2_t poly[12];
+  float64x2_t c0, c2, c4, c6, c8, c10;
   float64x2_t pi_over_2;
   uint64x2_t abs_mask;
+  double c1, c3, c5, c7, c9, c11;
 } data = {
   /* Polynomial approximation of  (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
      on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57.  */
-  .poly = { V2 (0x1.555555555554ep-3), V2 (0x1.3333333337233p-4),
-	    V2 (0x1.6db6db67f6d9fp-5), V2 (0x1.f1c71fbd29fbbp-6),
-	    V2 (0x1.6e8b264d467d6p-6), V2 (0x1.1c5997c357e9dp-6),
-	    V2 (0x1.c86a22cd9389dp-7), V2 (0x1.856073c22ebbep-7),
-	    V2 (0x1.fd1151acb6bedp-8), V2 (0x1.087182f799c1dp-6),
-	    V2 (-0x1.6602748120927p-7), V2 (0x1.cfa0dd1f9478p-6), },
-  .pi_over_2 = V2 (0x1.921fb54442d18p+0),
-  .abs_mask = V2 (0x7fffffffffffffff),
+  .c0 = V2 (0x1.555555555554ep-3),	  .c1 = 0x1.3333333337233p-4,
+  .c2 = V2 (0x1.6db6db67f6d9fp-5),	  .c3 = 0x1.f1c71fbd29fbbp-6,
+  .c4 = V2 (0x1.6e8b264d467d6p-6),	  .c5 = 0x1.1c5997c357e9dp-6,
+  .c6 = V2 (0x1.c86a22cd9389dp-7),	  .c7 = 0x1.856073c22ebbep-7,
+  .c8 = V2 (0x1.fd1151acb6bedp-8),	  .c9 = 0x1.087182f799c1dp-6,
+  .c10 = V2 (-0x1.6602748120927p-7),	  .c11 = 0x1.cfa0dd1f9478p-6,
+  .pi_over_2 = V2 (0x1.921fb54442d18p+0), .abs_mask = V2 (0x7fffffffffffffff),
 };
 
 #define AllMask v_u64 (0xffffffffffffffff)
@@ -68,8 +67,8 @@ special_case (float64x2_t x, float64x2_t y, uint64x2_t special)
      asin(x) = pi/2 - (y + y * z * P(z)), with  z = (1-x)/2 and y = sqrt(z).
 
    The largest observed error in this region is 2.69 ulps,
-   _ZGVnN2v_asin (0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1
-				       want 0x1.110d7e85fdd53p-1.  */
+   _ZGVnN2v_asin (0x1.044e8cefee301p-1) got 0x1.1111dd54ddf96p-1
+				       want 0x1.1111dd54ddf99p-1.  */
 float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x)
 {
   const struct data *d = ptr_barrier (&data);
@@ -86,7 +85,7 @@ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x)
     return special_case (x, x, AllMask);
 #endif
 
-  uint64x2_t a_lt_half = vcltq_f64 (ax, v_f64 (0.5));
+  uint64x2_t a_lt_half = vcaltq_f64 (x, v_f64 (0.5));
 
   /* Evaluate polynomial Q(x) = y + y * z * P(z) with
      z = x ^ 2 and y = |x|            , if |x| < 0.5
@@ -99,7 +98,26 @@ float64x2_t VPCS_ATTR V_NAME_D1 (asin) (float64x2_t x)
   float64x2_t z4 = vmulq_f64 (z2, z2);
   float64x2_t z8 = vmulq_f64 (z4, z4);
   float64x2_t z16 = vmulq_f64 (z8, z8);
-  float64x2_t p = v_estrin_11_f64 (z2, z4, z8, z16, d->poly);
+
+  /* Order-11 Estrin.  */
+  float64x2_t c13 = vld1q_f64 (&d->c1);
+  float64x2_t c57 = vld1q_f64 (&d->c5);
+  float64x2_t c911 = vld1q_f64 (&d->c9);
+
+  float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
+  float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
+  float64x2_t p03 = vfmaq_f64 (p01, z4, p23);
+
+  float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
+  float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
+  float64x2_t p47 = vfmaq_f64 (p45, z4, p67);
+
+  float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
+  float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
+  float64x2_t p811 = vfmaq_f64 (p89, z4, p1011);
+
+  float64x2_t p07 = vfmaq_f64 (p03, z8, p47);
+  float64x2_t p = vfmaq_f64 (p07, z16, p811);
 
   /* Finalize polynomial: z + z * z2 * P(z2).  */
   p = vfmaq_f64 (z, vmulq_f64 (z, z2), p);
diff --git a/sysdeps/aarch64/fpu/asin_sve.c b/sysdeps/aarch64/fpu/asin_sve.c
index 9314466f5..975f408be 100644
--- a/sysdeps/aarch64/fpu/asin_sve.c
+++ b/sysdeps/aarch64/fpu/asin_sve.c
@@ -18,45 +18,43 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include "sv_math.h"
-#include "poly_sve_f64.h"
 
 static const struct data
 {
-  float64_t poly[12];
-  float64_t pi_over_2f;
+  float64_t c1, c3, c5, c7, c9, c11;
+  float64_t c0, c2, c4, c6, c8, c10;
+  float64_t pi_over_2;
 } data = {
   /* Polynomial approximation of  (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))
      on [ 0x1p-106, 0x1p-2 ], relative error: 0x1.c3d8e169p-57.  */
-  .poly = { 0x1.555555555554ep-3, 0x1.3333333337233p-4,
-	    0x1.6db6db67f6d9fp-5, 0x1.f1c71fbd29fbbp-6,
-	    0x1.6e8b264d467d6p-6, 0x1.1c5997c357e9dp-6,
-	    0x1.c86a22cd9389dp-7, 0x1.856073c22ebbep-7,
-	    0x1.fd1151acb6bedp-8, 0x1.087182f799c1dp-6,
-	    -0x1.6602748120927p-7, 0x1.cfa0dd1f9478p-6, },
-  .pi_over_2f = 0x1.921fb54442d18p+0,
+  .c0 = 0x1.555555555554ep-3,	     .c1 = 0x1.3333333337233p-4,
+  .c2 = 0x1.6db6db67f6d9fp-5,	     .c3 = 0x1.f1c71fbd29fbbp-6,
+  .c4 = 0x1.6e8b264d467d6p-6,	     .c5 = 0x1.1c5997c357e9dp-6,
+  .c6 = 0x1.c86a22cd9389dp-7,	     .c7 = 0x1.856073c22ebbep-7,
+  .c8 = 0x1.fd1151acb6bedp-8,	     .c9 = 0x1.087182f799c1dp-6,
+  .c10 = -0x1.6602748120927p-7,	     .c11 = 0x1.cfa0dd1f9478p-6,
+  .pi_over_2 = 0x1.921fb54442d18p+0,
 };
 
-#define P(i) sv_f64 (d->poly[i])
-
 /* Double-precision SVE implementation of vector asin(x).
 
    For |x| in [0, 0.5], use an order 11 polynomial P such that the final
    approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
 
-   The largest observed error in this region is 0.52 ulps,
-   _ZGVsMxv_asin(0x1.d95ae04998b6cp-2) got 0x1.ec13757305f27p-2
-				      want 0x1.ec13757305f26p-2.
-
-   For |x| in [0.5, 1.0], use same approximation with a change of variable
+   The largest observed error in this region is 0.98 ulp:
+   _ZGVsMxv_asin (0x1.d98f6a748ed8ap-2) got 0x1.ec4eb661a73d3p-2
+				       want 0x1.ec4eb661a73d2p-2.
 
-     asin(x) = pi/2 - (y + y * z * P(z)), with  z = (1-x)/2 and y = sqrt(z).
+   For |x| in [0.5, 1.0], use same approximation with a change of variable:
+   asin(x) = pi/2 - (y + y * z * P(z)), with  z = (1-x)/2 and y = sqrt(z).
 
-   The largest observed error in this region is 2.69 ulps,
-   _ZGVsMxv_asin(0x1.044ac9819f573p-1) got 0x1.110d7e85fdd5p-1
-				      want 0x1.110d7e85fdd53p-1.  */
+   The largest observed error in this region is 2.66 ulp:
+   _ZGVsMxv_asin (0x1.04024f6e2a2fbp-1) got 0x1.10b9586f087a8p-1
+				       want 0x1.10b9586f087abp-1.  */
 svfloat64_t SV_NAME_D1 (asin) (svfloat64_t x, const svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);
+  svbool_t ptrue = svptrue_b64 ();
 
   svuint64_t sign = svand_x (pg, svreinterpret_u64 (x), 0x8000000000000000);
   svfloat64_t ax = svabs_x (pg, x);
@@ -70,17 +68,37 @@ svfloat64_t SV_NAME_D1 (asin) (svfloat64_t x, const svbool_t pg)
   svfloat64_t z = svsqrt_m (ax, a_ge_half, z2);
 
   /* Use a single polynomial approximation P for both intervals.  */
+  svfloat64_t z3 = svmul_x (pg, z2, z);
   svfloat64_t z4 = svmul_x (pg, z2, z2);
   svfloat64_t z8 = svmul_x (pg, z4, z4);
-  svfloat64_t z16 = svmul_x (pg, z8, z8);
-  svfloat64_t p = sv_estrin_11_f64_x (pg, z2, z4, z8, z16, d->poly);
+
+  svfloat64_t c13 = svld1rq (ptrue, &d->c1);
+  svfloat64_t c57 = svld1rq (ptrue, &d->c5);
+  svfloat64_t c911 = svld1rq (ptrue, &d->c9);
+
+  /* Order-11 Estrin scheme.  */
+  svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), z2, c13, 0);
+  svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), z2, c13, 1);
+  svfloat64_t p03 = svmla_x (pg, p01, z4, p23);
+
+  svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), z2, c57, 0);
+  svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), z2, c57, 1);
+  svfloat64_t p47 = svmla_x (pg, p45, z4, p67);
+
+  svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), z2, c911, 0);
+  svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), z2, c911, 1);
+  svfloat64_t p811 = svmla_x (pg, p89, z4, p1011);
+
+  svfloat64_t p411 = svmla_x (pg, p47, z8, p811);
+  svfloat64_t p = svmla_x (pg, p03, z8, p411);
+
   /* Finalize polynomial: z + z * z2 * P(z2).  */
-  p = svmla_x (pg, z, svmul_x (pg, z, z2), p);
+  p = svmla_x (pg, z, z3, p);
 
-  /* asin(|x|) = Q(|x|)         , for |x| < 0.5
-	       = pi/2 - 2 Q(|x|), for |x| >= 0.5.  */
-  svfloat64_t y = svmad_m (a_ge_half, p, sv_f64 (-2.0), d->pi_over_2f);
+  /* asin(|x|) = Q(|x|), for |x| <  0.5
+	    = pi/2 - 2 Q(|x|), for |x| >= 0.5.  */
+  svfloat64_t y = svmad_m (a_ge_half, p, sv_f64 (-2.0), d->pi_over_2);
 
-  /* Copy sign.  */
+  /* Reinsert the sign from the argument.  */
   return svreinterpret_f64 (svorr_x (pg, svreinterpret_u64 (y), sign));
 }
diff --git a/sysdeps/aarch64/fpu/asinf_advsimd.c b/sysdeps/aarch64/fpu/asinf_advsimd.c
index 52c7c0ec6..013936c2c 100644
--- a/sysdeps/aarch64/fpu/asinf_advsimd.c
+++ b/sysdeps/aarch64/fpu/asinf_advsimd.c
@@ -18,22 +18,21 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include "v_math.h"
-#include "poly_advsimd_f32.h"
 
 static const struct data
 {
-  float32x4_t poly[5];
+  float32x4_t c0, c2, c4;
+  float c1, c3;
   float32x4_t pi_over_2f;
 } data = {
   /* Polynomial approximation of  (asin(sqrt(x)) - sqrt(x)) / (x * sqrt(x))  on
      [ 0x1p-24 0x1p-2 ] order = 4 rel error: 0x1.00a23bbp-29 .  */
-  .poly = { V4 (0x1.55555ep-3), V4 (0x1.33261ap-4), V4 (0x1.70d7dcp-5),
-	    V4 (0x1.b059dp-6), V4 (0x1.3af7d8p-5) },
-  .pi_over_2f = V4 (0x1.921fb6p+0f),
+  .c0 = V4 (0x1.55555ep-3f), .c1 = 0x1.33261ap-4f,
+  .c2 = V4 (0x1.70d7dcp-5f), .c3 = 0x1.b059dp-6f,
+  .c4 = V4 (0x1.3af7d8p-5f), .pi_over_2f = V4 (0x1.921fb6p+0f),
 };
 
 #define AbsMask 0x7fffffff
-#define Half 0x3f000000
 #define One 0x3f800000
 #define Small 0x39800000 /* 2^-12.  */
 
@@ -47,11 +46,8 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
 
 /* Single-precision implementation of vector asin(x).
 
-   For |x| < Small, approximate asin(x) by x. Small = 2^-12 for correct
-   rounding. If WANT_SIMD_EXCEPT = 0, Small = 0 and we proceed with the
-   following approximation.
 
-   For |x| in [Small, 0.5], use order 4 polynomial P such that the final
+   For |x| < 0.5, use order 4 polynomial P such that the final
    approximation is an odd polynomial: asin(x) ~ x + x^3 P(x^2).
 
     The largest observed error in this region is 0.83 ulps,
@@ -80,24 +76,31 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (asin) (float32x4_t x)
 #endif
 
   float32x4_t ax = vreinterpretq_f32_u32 (ia);
-  uint32x4_t a_lt_half = vcltq_u32 (ia, v_u32 (Half));
+  uint32x4_t a_lt_half = vcaltq_f32 (x, v_f32 (0.5f));
 
   /* Evaluate polynomial Q(x) = y + y * z * P(z) with
      z = x ^ 2 and y = |x|            , if |x| < 0.5
      z = (1 - |x|) / 2 and y = sqrt(z), if |x| >= 0.5.  */
   float32x4_t z2 = vbslq_f32 (a_lt_half, vmulq_f32 (x, x),
-			      vfmsq_n_f32 (v_f32 (0.5), ax, 0.5));
+			      vfmsq_n_f32 (v_f32 (0.5f), ax, 0.5f));
   float32x4_t z = vbslq_f32 (a_lt_half, ax, vsqrtq_f32 (z2));
 
   /* Use a single polynomial approximation P for both intervals.  */
-  float32x4_t p = v_horner_4_f32 (z2, d->poly);
+
+  /* Order-4 pairwise Horner evaluation scheme.  */
+  float32x4_t z4 = vmulq_f32 (z2, z2);
+  float32x4_t c13 = vld1q_f32 (&d->c1);
+  float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c13, 0);
+  float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c13, 1);
+  float32x4_t p = vfmaq_f32 (p23, d->c4, z4);
+  p = vfmaq_f32 (p01, p, z4);
   /* Finalize polynomial: z + z * z2 * P(z2).  */
   p = vfmaq_f32 (z, vmulq_f32 (z, z2), p);
 
   /* asin(|x|) = Q(|x|)         , for |x| < 0.5
 	       = pi/2 - 2 Q(|x|), for |x| >= 0.5.  */
   float32x4_t y
-      = vbslq_f32 (a_lt_half, p, vfmsq_n_f32 (d->pi_over_2f, p, 2.0));
+      = vbslq_f32 (a_lt_half, p, vfmsq_n_f32 (d->pi_over_2f, p, 2.0f));
 
   /* Copy sign.  */
   return vbslq_f32 (v_u32 (AbsMask), y, x);
diff --git a/sysdeps/aarch64/fpu/asinh_sve.c b/sysdeps/aarch64/fpu/asinh_sve.c
index 0889f79db..ff6b71390 100644
--- a/sysdeps/aarch64/fpu/asinh_sve.c
+++ b/sysdeps/aarch64/fpu/asinh_sve.c
@@ -18,36 +18,49 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include "sv_math.h"
-#include "poly_sve_f64.h"
 
 #define SignMask (0x8000000000000000)
 #define One (0x3ff0000000000000)
 #define Thres (0x5fe0000000000000) /* asuint64 (0x1p511).  */
+#define IndexMask (((1 << V_LOG_TABLE_BITS) - 1) << 1)
 
 static const struct data
 {
-  double poly[18];
-  double ln2, p3, p1, p4, p0, p2;
-  uint64_t n;
-  uint64_t off;
+  double even_coeffs[9];
+  double ln2, p3, p1, p4, p0, p2, c1, c3, c5, c7, c9, c11, c13, c15, c17;
+  uint64_t off, mask;
 
 } data = {
-  /* Polynomial generated using Remez on [2^-26, 1].  */
-  .poly
-  = { -0x1.55555555554a7p-3, 0x1.3333333326c7p-4, -0x1.6db6db68332e6p-5,
-      0x1.f1c71b26fb40dp-6, -0x1.6e8b8b654a621p-6, 0x1.1c4daa9e67871p-6,
-      -0x1.c9871d10885afp-7, 0x1.7a16e8d9d2ecfp-7, -0x1.3ddca533e9f54p-7,
-      0x1.0becef748dafcp-7, -0x1.b90c7099dd397p-8, 0x1.541f2bb1ffe51p-8,
-      -0x1.d217026a669ecp-9, 0x1.0b5c7977aaf7p-9, -0x1.e0f37daef9127p-11,
-      0x1.388b5fe542a6p-12, -0x1.021a48685e287p-14, 0x1.93d4ba83d34dap-18 },
+   /* Polynomial generated using Remez on [2^-26, 1].  */
+  .even_coeffs ={
+    -0x1.55555555554a7p-3,
+    -0x1.6db6db68332e6p-5,
+    -0x1.6e8b8b654a621p-6,
+    -0x1.c9871d10885afp-7,
+    -0x1.3ddca533e9f54p-7,
+    -0x1.b90c7099dd397p-8,
+    -0x1.d217026a669ecp-9,
+    -0x1.e0f37daef9127p-11,
+    -0x1.021a48685e287p-14, },
+
+  .c1 = 0x1.3333333326c7p-4,
+  .c3 = 0x1.f1c71b26fb40dp-6,
+  .c5 = 0x1.1c4daa9e67871p-6,
+  .c7 = 0x1.7a16e8d9d2ecfp-7,
+  .c9 = 0x1.0becef748dafcp-7,
+  .c11 = 0x1.541f2bb1ffe51p-8,
+  .c13 = 0x1.0b5c7977aaf7p-9,
+  .c15 = 0x1.388b5fe542a6p-12,
+  .c17 = 0x1.93d4ba83d34dap-18,
+
   .ln2 = 0x1.62e42fefa39efp-1,
   .p0 = -0x1.ffffffffffff7p-2,
   .p1 = 0x1.55555555170d4p-2,
   .p2 = -0x1.0000000399c27p-2,
   .p3 = 0x1.999b2e90e94cap-3,
   .p4 = -0x1.554e550bd501ep-3,
-  .n = 1 << V_LOG_TABLE_BITS,
-  .off = 0x3fe6900900000000
+  .off = 0x3fe6900900000000,
+  .mask = 0xfffULL << 52,
 };
 
 static svfloat64_t NOINLINE
@@ -64,11 +77,10 @@ __sv_log_inline (svfloat64_t x, const struct data *d, const svbool_t pg)
      of the algorithm used.  */
 
   svuint64_t ix = svreinterpret_u64 (x);
-  svuint64_t tmp = svsub_x (pg, ix, d->off);
-  svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, (51 - V_LOG_TABLE_BITS)),
-			  (d->n - 1) << 1);
-  svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52);
-  svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, 0xfffULL << 52));
+  svuint64_t i_off = svsub_x (pg, ix, d->off);
+  svuint64_t i
+      = svand_x (pg, svlsr_x (pg, i_off, (51 - V_LOG_TABLE_BITS)), IndexMask);
+  svuint64_t iz = svsub_x (pg, ix, svand_x (pg, i_off, d->mask));
   svfloat64_t z = svreinterpret_f64 (iz);
 
   svfloat64_t invc = svld1_gather_index (pg, &__v_log_data.table[0].invc, i);
@@ -78,14 +90,14 @@ __sv_log_inline (svfloat64_t x, const struct data *d, const svbool_t pg)
   svfloat64_t p1_p4 = svld1rq (svptrue_b64 (), &d->p1);
 
   svfloat64_t r = svmla_x (pg, sv_f64 (-1.0), invc, z);
-  svfloat64_t kd = svcvt_f64_x (pg, k);
+  svfloat64_t kd
+      = svcvt_f64_x (pg, svasr_x (pg, svreinterpret_s64 (i_off), 52));
 
   svfloat64_t hi = svmla_lane (svadd_x (pg, logc, r), kd, ln2_p3, 0);
-  svfloat64_t r2 = svmul_x (pg, r, r);
-
+  svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
   svfloat64_t y = svmla_lane (sv_f64 (d->p2), r, ln2_p3, 1);
-
   svfloat64_t p = svmla_lane (sv_f64 (d->p0), r, p1_p4, 0);
+
   y = svmla_lane (y, r2, p1_p4, 1);
   y = svmla_x (pg, p, r2, y);
   y = svmla_x (pg, hi, r2, y);
@@ -111,7 +123,6 @@ svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg)
   svuint64_t iax = svbic_x (pg, ix, SignMask);
   svuint64_t sign = svand_x (pg, ix, SignMask);
   svfloat64_t ax = svreinterpret_f64 (iax);
-
   svbool_t ge1 = svcmpge (pg, iax, One);
   svbool_t special = svcmpge (pg, iax, Thres);
 
@@ -120,7 +131,7 @@ svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg)
   svfloat64_t option_1 = sv_f64 (0);
   if (__glibc_likely (svptest_any (pg, ge1)))
     {
-      svfloat64_t x2 = svmul_x (pg, ax, ax);
+      svfloat64_t x2 = svmul_x (svptrue_b64 (), ax, ax);
       option_1 = __sv_log_inline (
 	  svadd_x (pg, ax, svsqrt_x (pg, svadd_x (pg, x2, 1))), d, pg);
     }
@@ -130,21 +141,53 @@ svfloat64_t SV_NAME_D1 (asinh) (svfloat64_t x, const svbool_t pg)
      The largest observed error in this region is 1.51 ULPs:
      _ZGVsMxv_asinh(0x1.fe12bf8c616a2p-1) got 0x1.c1e649ee2681bp-1
 					 want 0x1.c1e649ee2681dp-1.  */
+
   svfloat64_t option_2 = sv_f64 (0);
   if (__glibc_likely (svptest_any (pg, svnot_z (pg, ge1))))
     {
-      svfloat64_t x2 = svmul_x (pg, ax, ax);
-      svfloat64_t x4 = svmul_x (pg, x2, x2);
-      svfloat64_t p = sv_pw_horner_17_f64_x (pg, x2, x4, d->poly);
-      option_2 = svmla_x (pg, ax, p, svmul_x (pg, x2, ax));
+      svfloat64_t x2 = svmul_x (svptrue_b64 (), ax, ax);
+      svfloat64_t x4 = svmul_x (svptrue_b64 (), x2, x2);
+      /* Order-17 Pairwise Horner scheme.  */
+      svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
+      svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5);
+      svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9);
+      svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13);
+
+      svfloat64_t p01 = svmla_lane (sv_f64 (d->even_coeffs[0]), x2, c13, 0);
+      svfloat64_t p23 = svmla_lane (sv_f64 (d->even_coeffs[1]), x2, c13, 1);
+      svfloat64_t p45 = svmla_lane (sv_f64 (d->even_coeffs[2]), x2, c57, 0);
+      svfloat64_t p67 = svmla_lane (sv_f64 (d->even_coeffs[3]), x2, c57, 1);
+      svfloat64_t p89 = svmla_lane (sv_f64 (d->even_coeffs[4]), x2, c911, 0);
+      svfloat64_t p1011 = svmla_lane (sv_f64 (d->even_coeffs[5]), x2, c911, 1);
+      svfloat64_t p1213
+	  = svmla_lane (sv_f64 (d->even_coeffs[6]), x2, c1315, 0);
+      svfloat64_t p1415
+	  = svmla_lane (sv_f64 (d->even_coeffs[7]), x2, c1315, 1);
+      svfloat64_t p1617 = svmla_x (pg, sv_f64 (d->even_coeffs[8]), x2, d->c17);
+
+      svfloat64_t p = svmla_x (pg, p1415, x4, p1617);
+      p = svmla_x (pg, p1213, x4, p);
+      p = svmla_x (pg, p1011, x4, p);
+      p = svmla_x (pg, p89, x4, p);
+
+      p = svmla_x (pg, p67, x4, p);
+      p = svmla_x (pg, p45, x4, p);
+
+      p = svmla_x (pg, p23, x4, p);
+
+      p = svmla_x (pg, p01, x4, p);
+
+      option_2 = svmla_x (pg, ax, p, svmul_x (svptrue_b64 (), x2, ax));
     }
 
-  /* Choose the right option for each lane.  */
-  svfloat64_t y = svsel (ge1, option_1, option_2);
-
   if (__glibc_unlikely (svptest_any (pg, special)))
     return special_case (
-	x, svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign)),
+	x,
+	svreinterpret_f64 (sveor_x (
+	    pg, svreinterpret_u64 (svsel (ge1, option_1, option_2)), sign)),
 	special);
+
+  /* Choose the right option for each lane.  */
+  svfloat64_t y = svsel (ge1, option_1, option_2);
   return svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign));
 }
diff --git a/sysdeps/aarch64/fpu/atan2_advsimd.c b/sysdeps/aarch64/fpu/atan2_advsimd.c
index 00b4a4f08..a31d52f3a 100644
--- a/sysdeps/aarch64/fpu/atan2_advsimd.c
+++ b/sysdeps/aarch64/fpu/atan2_advsimd.c
@@ -19,40 +19,38 @@
 
 #include "math_config.h"
 #include "v_math.h"
-#include "poly_advsimd_f64.h"
 
 static const struct data
 {
+  double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19;
   float64x2_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18;
   float64x2_t pi_over_2;
-  double c1, c3, c5, c7, c9, c11, c13, c15, c17, c19;
-  uint64x2_t zeroinfnan, minustwo;
+  uint64x2_t zeroinfnan;
 } data = {
-  /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
-	      [2**-1022, 1.0].  */
-  .c0 = V2 (-0x1.5555555555555p-2),
-  .c1 = 0x1.99999999996c1p-3,
-  .c2 = V2 (-0x1.2492492478f88p-3),
-  .c3 = 0x1.c71c71bc3951cp-4,
-  .c4 = V2 (-0x1.745d160a7e368p-4),
-  .c5 = 0x1.3b139b6a88ba1p-4,
-  .c6 = V2 (-0x1.11100ee084227p-4),
-  .c7 = 0x1.e1d0f9696f63bp-5,
-  .c8 = V2 (-0x1.aebfe7b418581p-5),
-  .c9 = 0x1.842dbe9b0d916p-5,
-  .c10 = V2 (-0x1.5d30140ae5e99p-5),
-  .c11 = 0x1.338e31eb2fbbcp-5,
-  .c12 = V2 (-0x1.00e6eece7de8p-5),
-  .c13 = 0x1.860897b29e5efp-6,
-  .c14 = V2 (-0x1.0051381722a59p-6),
-  .c15 = 0x1.14e9dc19a4a4ep-7,
-  .c16 = V2 (-0x1.d0062b42fe3bfp-9),
-  .c17 = 0x1.17739e210171ap-10,
-  .c18 = V2 (-0x1.ab24da7be7402p-13),
-  .c19 = 0x1.358851160a528p-16,
+  /* Coefficients of polynomial P such that
+     atan(x)~x+x*P(x^2) on [2^-1022, 1.0].  */
+  .c0 = V2 (-0x1.555555555552ap-2),
+  .c1 = 0x1.9999999995aebp-3,
+  .c2 = V2 (-0x1.24924923923f6p-3),
+  .c3 = 0x1.c71c7184288a2p-4,
+  .c4 = V2 (-0x1.745d11fb3d32bp-4),
+  .c5 = 0x1.3b136a18051b9p-4,
+  .c6 = V2 (-0x1.110e6d985f496p-4),
+  .c7 = 0x1.e1bcf7f08801dp-5,
+  .c8 = V2 (-0x1.ae644e28058c3p-5),
+  .c9 = 0x1.82eeb1fed85c6p-5,
+  .c10 = V2 (-0x1.59d7f901566cbp-5),
+  .c11 = 0x1.2c982855ab069p-5,
+  .c12 = V2 (-0x1.eb49592998177p-6),
+  .c13 = 0x1.69d8b396e3d38p-6,
+  .c14 = V2 (-0x1.ca980345c4204p-7),
+  .c15 = 0x1.dc050eafde0b3p-8,
+  .c16 = V2 (-0x1.7ea70755b8eccp-9),
+  .c17 = 0x1.ba3da3de903e8p-11,
+  .c18 = V2 (-0x1.44a4b059b6f67p-13),
+  .c19 = 0x1.c4a45029e5a91p-17,
   .pi_over_2 = V2 (0x1.921fb54442d18p+0),
   .zeroinfnan = V2 (2 * 0x7ff0000000000000ul - 1),
-  .minustwo = V2 (0xc000000000000000),
 };
 
 #define SignMask v_u64 (0x8000000000000000)
@@ -77,10 +75,9 @@ zeroinfnan (uint64x2_t i, const struct data *d)
 }
 
 /* Fast implementation of vector atan2.
-   Maximum observed error is 2.8 ulps:
-   _ZGVnN2vv_atan2 (0x1.9651a429a859ap+5, 0x1.953075f4ee26p+5)
-	got 0x1.92d628ab678ccp-1
-       want 0x1.92d628ab678cfp-1.  */
+   Maximum observed error is 1.97 ulps:
+   _ZGVnN2vv_atan2 (0x1.42337dba73768p+5, 0x1.422d748cd3e29p+5)
+   got 0x1.9224810264efcp-1 want 0x1.9224810264efep-1.  */
 float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x)
 {
   const struct data *d = ptr_barrier (&data);
@@ -101,26 +98,29 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x)
   uint64x2_t pred_xlt0 = vcltzq_f64 (x);
   uint64x2_t pred_aygtax = vcagtq_f64 (y, x);
 
-  /* Set up z for call to atan.  */
-  float64x2_t n = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay);
-  float64x2_t q = vbslq_f64 (pred_aygtax, ay, ax);
-  float64x2_t z = vdivq_f64 (n, q);
-
-  /* Work out the correct shift.  */
-  float64x2_t shift
-      = vreinterpretq_f64_u64 (vandq_u64 (pred_xlt0, d->minustwo));
-  shift = vbslq_f64 (pred_aygtax, vaddq_f64 (shift, v_f64 (1.0)), shift);
-  shift = vmulq_f64 (shift, d->pi_over_2);
-
-  /* Calculate the polynomial approximation.
-     Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
-     full scheme to avoid underflow in x^16.
-     The order 19 polynomial P approximates
-     (atan(sqrt(x))-sqrt(x))/x^(3/2).  */
+  /* Set up z for evaluation of atan.  */
+  float64x2_t num = vbslq_f64 (pred_aygtax, vnegq_f64 (ax), ay);
+  float64x2_t den = vbslq_f64 (pred_aygtax, ay, ax);
+  float64x2_t z = vdivq_f64 (num, den);
+
+  /* Work out the correct shift for atan2:
+     Multiplication by pi/2 is done later.
+     -pi   when x < 0  and ax >= ay
+     -pi/2 when x < 0  and ax < ay
+      0    when x >= 0 and ax >= ay
+      pi/2 when x >= 0 and ax < ay.  */
+  float64x2_t shift = vreinterpretq_f64_u64 (
+      vandq_u64 (pred_xlt0, vreinterpretq_u64_f64 (v_f64 (-2.0))));
+  float64x2_t shift2 = vreinterpretq_f64_u64 (
+      vandq_u64 (pred_aygtax, vreinterpretq_u64_f64 (v_f64 (1.0))));
+  shift = vaddq_f64 (shift, shift2);
+
+  /* Calculate the polynomial approximation.  */
   float64x2_t z2 = vmulq_f64 (z, z);
-  float64x2_t x2 = vmulq_f64 (z2, z2);
-  float64x2_t x4 = vmulq_f64 (x2, x2);
-  float64x2_t x8 = vmulq_f64 (x4, x4);
+  float64x2_t z3 = vmulq_f64 (z2, z);
+  float64x2_t z4 = vmulq_f64 (z2, z2);
+  float64x2_t z8 = vmulq_f64 (z4, z4);
+  float64x2_t z16 = vmulq_f64 (z8, z8);
 
   float64x2_t c13 = vld1q_f64 (&d->c1);
   float64x2_t c57 = vld1q_f64 (&d->c5);
@@ -128,45 +128,43 @@ float64x2_t VPCS_ATTR V_NAME_D2 (atan2) (float64x2_t y, float64x2_t x)
   float64x2_t c1315 = vld1q_f64 (&d->c13);
   float64x2_t c1719 = vld1q_f64 (&d->c17);
 
-  /* estrin_7.  */
+  /* Order-7 Estrin.  */
   float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
   float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
-  float64x2_t p03 = vfmaq_f64 (p01, x2, p23);
+  float64x2_t p03 = vfmaq_f64 (p01, z4, p23);
 
   float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
   float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
-  float64x2_t p47 = vfmaq_f64 (p45, x2, p67);
+  float64x2_t p47 = vfmaq_f64 (p45, z4, p67);
 
-  float64x2_t p07 = vfmaq_f64 (p03, x4, p47);
+  float64x2_t p07 = vfmaq_f64 (p03, z8, p47);
 
-  /* estrin_11.  */
+  /* Order-11 Estrin.  */
   float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
   float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
-  float64x2_t p811 = vfmaq_f64 (p89, x2, p1011);
+  float64x2_t p811 = vfmaq_f64 (p89, z4, p1011);
 
   float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0);
   float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1);
-  float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415);
+  float64x2_t p1215 = vfmaq_f64 (p1213, z4, p1415);
 
   float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0);
   float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1);
-  float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819);
+  float64x2_t p1619 = vfmaq_f64 (p1617, z4, p1819);
 
-  float64x2_t p815 = vfmaq_f64 (p811, x4, p1215);
-  float64x2_t p819 = vfmaq_f64 (p815, x8, p1619);
+  float64x2_t p815 = vfmaq_f64 (p811, z8, p1215);
+  float64x2_t p819 = vfmaq_f64 (p815, z16, p1619);
 
-  float64x2_t ret = vfmaq_f64 (p07, p819, x8);
+  float64x2_t poly = vfmaq_f64 (p07, p819, z16);
 
   /* Finalize. y = shift + z + z^3 * P(z^2).  */
-  ret = vfmaq_f64 (z, ret, vmulq_f64 (z2, z));
-  ret = vaddq_f64 (ret, shift);
+  float64x2_t ret = vfmaq_f64 (z, shift, d->pi_over_2);
+  ret = vfmaq_f64 (ret, z3, poly);
 
   if (__glibc_unlikely (v_any_u64 (special_cases)))
     return special_case (y, x, ret, sign_xy, special_cases);
 
   /* Account for the sign of x and y.  */
-  ret = vreinterpretq_f64_u64 (
+  return vreinterpretq_f64_u64 (
       veorq_u64 (vreinterpretq_u64_f64 (ret), sign_xy));
-
-  return ret;
 }
diff --git a/sysdeps/aarch64/fpu/atan2_sve.c b/sysdeps/aarch64/fpu/atan2_sve.c
index 163f61308..9e2dd249d 100644
--- a/sysdeps/aarch64/fpu/atan2_sve.c
+++ b/sysdeps/aarch64/fpu/atan2_sve.c
@@ -19,25 +19,25 @@
 
 #include "math_config.h"
 #include "sv_math.h"
-#include "poly_sve_f64.h"
 
 static const struct data
 {
-  float64_t poly[20];
-  float64_t pi_over_2;
+  float64_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18;
+  float64_t c1, c3, c5, c7, c9, c11, c13, c15, c17, c19;
 } data = {
   /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
      [2**-1022, 1.0].  */
-  .poly = { -0x1.5555555555555p-2,  0x1.99999999996c1p-3, -0x1.2492492478f88p-3,
-            0x1.c71c71bc3951cp-4,   -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4,
-            -0x1.11100ee084227p-4,  0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5,
-            0x1.842dbe9b0d916p-5,   -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5,
-            -0x1.00e6eece7de8p-5,   0x1.860897b29e5efp-6, -0x1.0051381722a59p-6,
-            0x1.14e9dc19a4a4ep-7,  -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10,
-            -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16, },
-  .pi_over_2 = 0x1.921fb54442d18p+0,
+  .c0 = -0x1.555555555552ap-2,	 .c1 = 0x1.9999999995aebp-3,
+  .c2 = -0x1.24924923923f6p-3,	 .c3 = 0x1.c71c7184288a2p-4,
+  .c4 = -0x1.745d11fb3d32bp-4,	 .c5 = 0x1.3b136a18051b9p-4,
+  .c6 = -0x1.110e6d985f496p-4,	 .c7 = 0x1.e1bcf7f08801dp-5,
+  .c8 = -0x1.ae644e28058c3p-5,	 .c9 = 0x1.82eeb1fed85c6p-5,
+  .c10 = -0x1.59d7f901566cbp-5,	 .c11 = 0x1.2c982855ab069p-5,
+  .c12 = -0x1.eb49592998177p-6,	 .c13 = 0x1.69d8b396e3d38p-6,
+  .c14 = -0x1.ca980345c4204p-7,	 .c15 = 0x1.dc050eafde0b3p-8,
+  .c16 = -0x1.7ea70755b8eccp-9,	 .c17 = 0x1.ba3da3de903e8p-11,
+  .c18 = -0x1.44a4b059b6f67p-13, .c19 = 0x1.c4a45029e5a91p-17,
 };
-
 /* Special cases i.e. 0, infinity, nan (fall back to scalar calls).  */
 static svfloat64_t NOINLINE
 special_case (svfloat64_t y, svfloat64_t x, svfloat64_t ret,
@@ -56,15 +56,17 @@ zeroinfnan (svuint64_t i, const svbool_t pg)
 }
 
 /* Fast implementation of SVE atan2. Errors are greatest when y and
-   x are reasonably close together. The greatest observed error is 2.28 ULP:
-   _ZGVsMxvv_atan2 (-0x1.5915b1498e82fp+732, 0x1.54d11ef838826p+732)
-   got -0x1.954f42f1fa841p-1 want -0x1.954f42f1fa843p-1.  */
-svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg)
+   x are reasonably close together. The greatest observed error is 1.94 ULP:
+   _ZGVsMxvv_atan2 (0x1.8a4bf7167228ap+5, 0x1.84971226bb57bp+5)
+   got 0x1.95db19dfef9ccp-1 want 0x1.95db19dfef9cep-1.  */
+svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x,
+				const svbool_t pg)
 {
-  const struct data *data_ptr = ptr_barrier (&data);
+  const struct data *d = ptr_barrier (&data);
 
   svuint64_t ix = svreinterpret_u64 (x);
   svuint64_t iy = svreinterpret_u64 (y);
+  svbool_t ptrue = svptrue_b64 ();
 
   svbool_t cmp_x = zeroinfnan (ix, pg);
   svbool_t cmp_y = zeroinfnan (iy, pg);
@@ -81,32 +83,67 @@ svfloat64_t SV_NAME_D2 (atan2) (svfloat64_t y, svfloat64_t x, const svbool_t pg)
 
   svbool_t pred_aygtax = svcmpgt (pg, ay, ax);
 
-  /* Set up z for call to atan.  */
-  svfloat64_t n = svsel (pred_aygtax, svneg_x (pg, ax), ay);
-  svfloat64_t d = svsel (pred_aygtax, ay, ax);
-  svfloat64_t z = svdiv_x (pg, n, d);
-
-  /* Work out the correct shift.  */
+  /* Set up z for evaluation of atan.  */
+  svfloat64_t num = svsel (pred_aygtax, svneg_x (pg, ax), ay);
+  svfloat64_t den = svsel (pred_aygtax, ay, ax);
+  svfloat64_t z = svdiv_x (pg, num, den);
+
+  /* Work out the correct shift for atan2:
+     Multiplication by pi/2 is done later.
+     -pi   when x < 0  and ax >= ay
+     -pi/2 when x < 0  and ax < ay
+      0    when x >= 0 and ax >= ay
+      pi/2 when x >= 0 and ax < ay.  */
   svfloat64_t shift = svreinterpret_f64 (svlsr_x (pg, sign_x, 1));
+  svfloat64_t shift_mul = svreinterpret_f64 (
+      svorr_x (pg, sign_x, svreinterpret_u64 (sv_f64 (0x1.921fb54442d18p+0))));
   shift = svsel (pred_aygtax, sv_f64 (1.0), shift);
-  shift = svreinterpret_f64 (svorr_x (pg, sign_x, svreinterpret_u64 (shift)));
-  shift = svmul_x (pg, shift, data_ptr->pi_over_2);
+  shift = svmla_x (pg, z, shift, shift_mul);
 
   /* Use split Estrin scheme for P(z^2) with deg(P)=19.  */
   svfloat64_t z2 = svmul_x (pg, z, z);
-  svfloat64_t x2 = svmul_x (pg, z2, z2);
-  svfloat64_t x4 = svmul_x (pg, x2, x2);
-  svfloat64_t x8 = svmul_x (pg, x4, x4);
+  svfloat64_t z3 = svmul_x (pg, z2, z);
+  svfloat64_t z4 = svmul_x (pg, z2, z2);
+  svfloat64_t z8 = svmul_x (pg, z4, z4);
+  svfloat64_t z16 = svmul_x (pg, z8, z8);
 
-  svfloat64_t ret = svmla_x (
-      pg, sv_estrin_7_f64_x (pg, z2, x2, x4, data_ptr->poly),
-      sv_estrin_11_f64_x (pg, z2, x2, x4, x8, data_ptr->poly + 8), x8);
+  /* Order-7 Estrin.  */
+  svfloat64_t c13 = svld1rq (ptrue, &d->c1);
+  svfloat64_t c57 = svld1rq (ptrue, &d->c5);
 
-  /* y = shift + z + z^3 * P(z^2).  */
-  svfloat64_t z3 = svmul_x (pg, z2, z);
-  ret = svmla_x (pg, z, z3, ret);
+  svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), z2, c13, 0);
+  svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), z2, c13, 1);
+  svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), z2, c57, 0);
+  svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), z2, c57, 1);
+
+  svfloat64_t p03 = svmla_x (pg, p01, z4, p23);
+  svfloat64_t p47 = svmla_x (pg, p45, z4, p67);
+  svfloat64_t p07 = svmla_x (pg, p03, z8, p47);
+
+  /* Order-11 Estrin.  */
+  svfloat64_t c911 = svld1rq (ptrue, &d->c9);
+  svfloat64_t c1315 = svld1rq (ptrue, &d->c13);
+  svfloat64_t c1719 = svld1rq (ptrue, &d->c17);
 
-  ret = svadd_m (pg, ret, shift);
+  svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), z2, c911, 0);
+  svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), z2, c911, 1);
+  svfloat64_t p811 = svmla_x (pg, p89, z4, p1011);
+
+  svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), z2, c1315, 0);
+  svfloat64_t p1415 = svmla_lane (sv_f64 (d->c14), z2, c1315, 1);
+  svfloat64_t p1215 = svmla_x (pg, p1213, z4, p1415);
+
+  svfloat64_t p1617 = svmla_lane (sv_f64 (d->c16), z2, c1719, 0);
+  svfloat64_t p1819 = svmla_lane (sv_f64 (d->c18), z2, c1719, 1);
+  svfloat64_t p1619 = svmla_x (pg, p1617, z4, p1819);
+
+  svfloat64_t p815 = svmla_x (pg, p811, z8, p1215);
+  svfloat64_t p819 = svmla_x (pg, p815, z16, p1619);
+
+  svfloat64_t poly = svmla_x (pg, p07, z16, p819);
+
+  /* y = shift + z + z^3 * P(z^2).  */
+  svfloat64_t ret = svmla_x (pg, shift, z3, poly);
 
   /* Account for the sign of x and y.  */
   if (__glibc_unlikely (svptest_any (pg, cmp_xy)))
diff --git a/sysdeps/aarch64/fpu/atan2f_advsimd.c b/sysdeps/aarch64/fpu/atan2f_advsimd.c
index e65406f49..75d873897 100644
--- a/sysdeps/aarch64/fpu/atan2f_advsimd.c
+++ b/sysdeps/aarch64/fpu/atan2f_advsimd.c
@@ -18,22 +18,22 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include "v_math.h"
-#include "poly_advsimd_f32.h"
 
 static const struct data
 {
-  float32x4_t c0, pi_over_2, c4, c6, c2;
+  float32x4_t c0, c4, c6, c2;
   float c1, c3, c5, c7;
   uint32x4_t comp_const;
+  float32x4_t pi;
 } data = {
   /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
      [2**-128, 1.0].
      Generated using fpminimax between FLT_MIN and 1.  */
-  .c0 = V4 (-0x1.55555p-2f),	    .c1 = 0x1.99935ep-3f,
-  .c2 = V4 (-0x1.24051ep-3f),	    .c3 = 0x1.bd7368p-4f,
-  .c4 = V4 (-0x1.491f0ep-4f),	    .c5 = 0x1.93a2c0p-5f,
-  .c6 = V4 (-0x1.4c3c60p-6f),	    .c7 = 0x1.01fd88p-8f,
-  .pi_over_2 = V4 (0x1.921fb6p+0f), .comp_const = V4 (2 * 0x7f800000lu - 1),
+  .c0 = V4 (-0x1.5554dcp-2), .c1 = 0x1.9978ecp-3,
+  .c2 = V4 (-0x1.230a94p-3), .c3 = 0x1.b4debp-4,
+  .c4 = V4 (-0x1.3550dap-4), .c5 = 0x1.61eebp-5,
+  .c6 = V4 (-0x1.0c17d4p-6), .c7 = 0x1.7ea694p-9,
+  .pi = V4 (0x1.921fb6p+1f), .comp_const = V4 (2 * 0x7f800000lu - 1),
 };
 
 #define SignMask v_u32 (0x80000000)
@@ -54,13 +54,13 @@ static inline uint32x4_t
 zeroinfnan (uint32x4_t i, const struct data *d)
 {
   /* 2 * i - 1 >= 2 * 0x7f800000lu - 1.  */
-  return vcgeq_u32 (vsubq_u32 (vmulq_n_u32 (i, 2), v_u32 (1)), d->comp_const);
+  return vcgeq_u32 (vsubq_u32 (vshlq_n_u32 (i, 1), v_u32 (1)), d->comp_const);
 }
 
 /* Fast implementation of vector atan2f. Maximum observed error is
-   2.95 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]:
-   _ZGVnN4vv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1
-						 want 0x1.967f00p-1.  */
+   2.13 ULP in [0x1.9300d6p+6 0x1.93c0c6p+6] x [0x1.8c2dbp+6 0x1.8cea6p+6]:
+   _ZGVnN4vv_atan2f (0x1.14a9d4p-87, 0x1.0eb886p-87) got 0x1.97aea2p-1
+						    want 0x1.97ae9ep-1.  */
 float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
@@ -81,28 +81,31 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
   uint32x4_t pred_xlt0 = vcltzq_f32 (x);
   uint32x4_t pred_aygtax = vcgtq_f32 (ay, ax);
 
-  /* Set up z for call to atanf.  */
-  float32x4_t n = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay);
-  float32x4_t q = vbslq_f32 (pred_aygtax, ay, ax);
-  float32x4_t z = vdivq_f32 (n, q);
-
-  /* Work out the correct shift.  */
+  /* Set up z for evaluation of atanf.  */
+  float32x4_t num = vbslq_f32 (pred_aygtax, vnegq_f32 (ax), ay);
+  float32x4_t den = vbslq_f32 (pred_aygtax, ay, ax);
+  float32x4_t z = vdivq_f32 (num, den);
+
+  /* Work out the correct shift for atan2:
+     Multiplication by pi is done later.
+     -pi   when x < 0  and ax >= ay
+     -pi/2 when x < 0  and ax < ay
+      0    when x >= 0 and ax >= ay
+      pi/2 when x >= 0 and ax < ay.  */
   float32x4_t shift = vreinterpretq_f32_u32 (
-      vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-2.0f))));
-  shift = vbslq_f32 (pred_aygtax, vaddq_f32 (shift, v_f32 (1.0f)), shift);
-  shift = vmulq_f32 (shift, d->pi_over_2);
-
-  /* Calculate the polynomial approximation.
-     Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However,
-     a standard implementation using z8 creates spurious underflow
-     in the very last fma (when z^8 is small enough).
-     Therefore, we split the last fma into a mul and an fma.
-     Horner and single-level Estrin have higher errors that exceed
-     threshold.  */
+      vandq_u32 (pred_xlt0, vreinterpretq_u32_f32 (v_f32 (-1.0f))));
+  float32x4_t shift2 = vreinterpretq_f32_u32 (
+      vandq_u32 (pred_aygtax, vreinterpretq_u32_f32 (v_f32 (0.5f))));
+  shift = vaddq_f32 (shift, shift2);
+
+  /* Calculate the polynomial approximation.  */
   float32x4_t z2 = vmulq_f32 (z, z);
+  float32x4_t z3 = vmulq_f32 (z2, z);
   float32x4_t z4 = vmulq_f32 (z2, z2);
+  float32x4_t z8 = vmulq_f32 (z4, z4);
 
   float32x4_t c1357 = vld1q_f32 (&d->c1);
+
   float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, c1357, 0);
   float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, c1357, 1);
   float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, c1357, 2);
@@ -110,10 +113,11 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F2 (atan2) (float32x4_t y, float32x4_t x)
   float32x4_t p03 = vfmaq_f32 (p01, z4, p23);
   float32x4_t p47 = vfmaq_f32 (p45, z4, p67);
 
-  float32x4_t ret = vfmaq_f32 (p03, z4, vmulq_f32 (z4, p47));
+  float32x4_t poly = vfmaq_f32 (p03, z8, p47);
 
   /* y = shift + z * P(z^2).  */
-  ret = vaddq_f32 (vfmaq_f32 (z, ret, vmulq_f32 (z2, z)), shift);
+  float32x4_t ret = vfmaq_f32 (z, shift, d->pi);
+  ret = vfmaq_f32 (ret, z3, poly);
 
   if (__glibc_unlikely (v_any_u32 (special_cases)))
     {
diff --git a/sysdeps/aarch64/fpu/atan2f_sve.c b/sysdeps/aarch64/fpu/atan2f_sve.c
index 5f26e2a36..4d9341952 100644
--- a/sysdeps/aarch64/fpu/atan2f_sve.c
+++ b/sysdeps/aarch64/fpu/atan2f_sve.c
@@ -18,18 +18,18 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include "sv_math.h"
-#include "poly_sve_f32.h"
 
 static const struct data
 {
-  float32_t poly[8];
+  float32_t c0, c2, c4, c6;
+  float32_t c1, c3, c5, c7;
   float32_t pi_over_2;
 } data = {
   /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
      [2**-128, 1.0].  */
-  .poly = { -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f,
-	    -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f },
-  .pi_over_2 = 0x1.921fb6p+0f,
+  .c0 = -0x1.5554dcp-2, .c1 = 0x1.9978ecp-3,  .c2 = -0x1.230a94p-3,
+  .c3 = 0x1.b4debp-4,	.c4 = -0x1.3550dap-4, .c5 = 0x1.61eebp-5,
+  .c6 = -0x1.0c17d4p-6, .c7 = 0x1.7ea694p-9,  .pi_over_2 = 0x1.921fb6p+0f,
 };
 
 /* Special cases i.e. 0, infinity, nan (fall back to scalar calls).  */
@@ -51,12 +51,14 @@ zeroinfnan (svuint32_t i, const svbool_t pg)
 
 /* Fast implementation of SVE atan2f based on atan(x) ~ shift + z + z^3 *
    P(z^2) with reduction to [0,1] using z=1/x and shift = pi/2. Maximum
-   observed error is 2.95 ULP:
-   _ZGVsMxvv_atan2f (0x1.93836cp+6, 0x1.8cae1p+6) got 0x1.967f06p-1
-						 want 0x1.967f00p-1.  */
-svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg)
+   observed error is 2.21 ULP:
+   _ZGVsMxvv_atan2f (0x1.a04aa8p+6, 0x1.9a274p+6) got 0x1.95ed3ap-1
+						 want 0x1.95ed36p-1.  */
+svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x,
+				const svbool_t pg)
 {
-  const struct data *data_ptr = ptr_barrier (&data);
+  const struct data *d = ptr_barrier (&data);
+  svbool_t ptrue = svptrue_b32 ();
 
   svuint32_t ix = svreinterpret_u32 (x);
   svuint32_t iy = svreinterpret_u32 (y);
@@ -76,29 +78,42 @@ svfloat32_t SV_NAME_F2 (atan2) (svfloat32_t y, svfloat32_t x, const svbool_t pg)
 
   svbool_t pred_aygtax = svcmpgt (pg, ay, ax);
 
-  /* Set up z for call to atan.  */
-  svfloat32_t n = svsel (pred_aygtax, svneg_x (pg, ax), ay);
-  svfloat32_t d = svsel (pred_aygtax, ay, ax);
-  svfloat32_t z = svdiv_x (pg, n, d);
-
-  /* Work out the correct shift.  */
+  /* Set up z for evaluation of atanf.  */
+  svfloat32_t num = svsel (pred_aygtax, svneg_x (pg, ax), ay);
+  svfloat32_t den = svsel (pred_aygtax, ay, ax);
+  svfloat32_t z = svdiv_x (ptrue, num, den);
+
+  /* Work out the correct shift for atan2:
+     Multiplication by pi/2 is done later.
+     -pi   when x < 0  and ax >= ay
+     -pi/2 when x < 0  and ax < ay
+      0    when x >= 0 and ax >= ay
+      pi/2 when x >= 0 and ax < ay.  */
   svfloat32_t shift = svreinterpret_f32 (svlsr_x (pg, sign_x, 1));
   shift = svsel (pred_aygtax, sv_f32 (1.0), shift);
   shift = svreinterpret_f32 (svorr_x (pg, sign_x, svreinterpret_u32 (shift)));
-  shift = svmul_x (pg, shift, sv_f32 (data_ptr->pi_over_2));
 
   /* Use pure Estrin scheme for P(z^2) with deg(P)=7.  */
-  svfloat32_t z2 = svmul_x (pg, z, z);
+  svfloat32_t z2 = svmul_x (ptrue, z, z);
+  svfloat32_t z3 = svmul_x (pg, z2, z);
   svfloat32_t z4 = svmul_x (pg, z2, z2);
   svfloat32_t z8 = svmul_x (pg, z4, z4);
 
-  svfloat32_t ret = sv_estrin_7_f32_x (pg, z2, z4, z8, data_ptr->poly);
+  svfloat32_t odd_coeffs = svld1rq (ptrue, &d->c1);
 
-  /* ret = shift + z + z^3 * P(z^2).  */
-  svfloat32_t z3 = svmul_x (pg, z2, z);
-  ret = svmla_x (pg, z, z3, ret);
+  svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), z2, odd_coeffs, 0);
+  svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), z2, odd_coeffs, 1);
+  svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), z2, odd_coeffs, 2);
+  svfloat32_t p67 = svmla_lane (sv_f32 (d->c6), z2, odd_coeffs, 3);
 
-  ret = svadd_m (pg, ret, shift);
+  svfloat32_t p03 = svmla_x (pg, p01, z4, p23);
+  svfloat32_t p47 = svmla_x (pg, p45, z4, p67);
+
+  svfloat32_t poly = svmla_x (pg, p03, z8, p47);
+
+  /* ret = shift + z + z^3 * P(z^2).  */
+  svfloat32_t ret = svmla_x (pg, z, shift, sv_f32 (d->pi_over_2));
+  ret = svmla_x (pg, ret, z3, poly);
 
   /* Account for the sign of x and y.  */
 
diff --git a/sysdeps/aarch64/fpu/atan_advsimd.c b/sysdeps/aarch64/fpu/atan_advsimd.c
index f024fd1d7..da0d3715d 100644
--- a/sysdeps/aarch64/fpu/atan_advsimd.c
+++ b/sysdeps/aarch64/fpu/atan_advsimd.c
@@ -18,7 +18,6 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include "v_math.h"
-#include "poly_advsimd_f64.h"
 
 static const struct data
 {
@@ -28,16 +27,16 @@ static const struct data
 } data = {
   /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
 	      [2**-1022, 1.0].  */
-  .c0 = V2 (-0x1.5555555555555p-2),	  .c1 = 0x1.99999999996c1p-3,
-  .c2 = V2 (-0x1.2492492478f88p-3),	  .c3 = 0x1.c71c71bc3951cp-4,
-  .c4 = V2 (-0x1.745d160a7e368p-4),	  .c5 = 0x1.3b139b6a88ba1p-4,
-  .c6 = V2 (-0x1.11100ee084227p-4),	  .c7 = 0x1.e1d0f9696f63bp-5,
-  .c8 = V2 (-0x1.aebfe7b418581p-5),	  .c9 = 0x1.842dbe9b0d916p-5,
-  .c10 = V2 (-0x1.5d30140ae5e99p-5),	  .c11 = 0x1.338e31eb2fbbcp-5,
-  .c12 = V2 (-0x1.00e6eece7de8p-5),	  .c13 = 0x1.860897b29e5efp-6,
-  .c14 = V2 (-0x1.0051381722a59p-6),	  .c15 = 0x1.14e9dc19a4a4ep-7,
-  .c16 = V2 (-0x1.d0062b42fe3bfp-9),	  .c17 = 0x1.17739e210171ap-10,
-  .c18 = V2 (-0x1.ab24da7be7402p-13),	  .c19 = 0x1.358851160a528p-16,
+  .c0 = V2 (-0x1.555555555552ap-2),	  .c1 = 0x1.9999999995aebp-3,
+  .c2 = V2 (-0x1.24924923923f6p-3),	  .c3 = 0x1.c71c7184288a2p-4,
+  .c4 = V2 (-0x1.745d11fb3d32bp-4),	  .c5 = 0x1.3b136a18051b9p-4,
+  .c6 = V2 (-0x1.110e6d985f496p-4),	  .c7 = 0x1.e1bcf7f08801dp-5,
+  .c8 = V2 (-0x1.ae644e28058c3p-5),	  .c9 = 0x1.82eeb1fed85c6p-5,
+  .c10 = V2 (-0x1.59d7f901566cbp-5),	  .c11 = 0x1.2c982855ab069p-5,
+  .c12 = V2 (-0x1.eb49592998177p-6),	  .c13 = 0x1.69d8b396e3d38p-6,
+  .c14 = V2 (-0x1.ca980345c4204p-7),	  .c15 = 0x1.dc050eafde0b3p-8,
+  .c16 = V2 (-0x1.7ea70755b8eccp-9),	  .c17 = 0x1.ba3da3de903e8p-11,
+  .c18 = V2 (-0x1.44a4b059b6f67p-13),	  .c19 = 0x1.c4a45029e5a91p-17,
   .pi_over_2 = V2 (0x1.921fb54442d18p+0),
 };
 
@@ -47,9 +46,9 @@ static const struct data
 
 /* Fast implementation of vector atan.
    Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using
-   z=1/x and shift = pi/2. Maximum observed error is 2.27 ulps:
-   _ZGVnN2v_atan (0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1
-				       want 0x1.9225645bdd7c3p-1.  */
+   z=1/x and shift = pi/2. Maximum observed error is 2.45 ulps:
+   _ZGVnN2v_atan (0x1.0008d737eb3e6p+0) got 0x1.92288c551a4c1p-1
+				       want 0x1.92288c551a4c3p-1.  */
 float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
 {
   const struct data *d = ptr_barrier (&data);
@@ -78,59 +77,53 @@ float64x2_t VPCS_ATTR V_NAME_D1 (atan) (float64x2_t x)
      y := arctan(x) for x < 1
      y := pi/2 + arctan(-1/x) for x > 1
      Hence, use z=-1/a if x>=1, otherwise z=a.  */
-  uint64x2_t red = vcagtq_f64 (x, v_f64 (1.0));
+  uint64x2_t red = vcagtq_f64 (x, v_f64 (-1.0));
   /* Avoid dependency in abs(x) in division (and comparison).  */
-  float64x2_t z = vbslq_f64 (red, vdivq_f64 (v_f64 (1.0), x), x);
+  float64x2_t z = vbslq_f64 (red, vdivq_f64 (v_f64 (-1.0), x), x);
+
   float64x2_t shift = vreinterpretq_f64_u64 (
       vandq_u64 (red, vreinterpretq_u64_f64 (d->pi_over_2)));
-  /* Use absolute value only when needed (odd powers of z).  */
-  float64x2_t az = vbslq_f64 (
-      SignMask, vreinterpretq_f64_u64 (vandq_u64 (SignMask, red)), z);
-
-  /* Calculate the polynomial approximation.
-     Use split Estrin scheme for P(z^2) with deg(P)=19. Use split instead of
-     full scheme to avoid underflow in x^16.
-     The order 19 polynomial P approximates
-     (atan(sqrt(x))-sqrt(x))/x^(3/2).  */
+
+  /* Reinsert sign bit from argument into the shift value.  */
+  shift = vreinterpretq_f64_u64 (
+      veorq_u64 (vreinterpretq_u64_f64 (shift), sign));
+
+  /* Calculate polynomial approximation P(z^2) with deg(P)=19.  */
   float64x2_t z2 = vmulq_f64 (z, z);
-  float64x2_t x2 = vmulq_f64 (z2, z2);
-  float64x2_t x4 = vmulq_f64 (x2, x2);
-  float64x2_t x8 = vmulq_f64 (x4, x4);
+  float64x2_t z4 = vmulq_f64 (z2, z2);
+  float64x2_t z8 = vmulq_f64 (z4, z4);
+  float64x2_t z16 = vmulq_f64 (z8, z8);
 
-  /* estrin_7.  */
+  /* Order-7 Estrin.  */
   float64x2_t p01 = vfmaq_laneq_f64 (d->c0, z2, c13, 0);
   float64x2_t p23 = vfmaq_laneq_f64 (d->c2, z2, c13, 1);
-  float64x2_t p03 = vfmaq_f64 (p01, x2, p23);
+  float64x2_t p03 = vfmaq_f64 (p01, z4, p23);
 
   float64x2_t p45 = vfmaq_laneq_f64 (d->c4, z2, c57, 0);
   float64x2_t p67 = vfmaq_laneq_f64 (d->c6, z2, c57, 1);
-  float64x2_t p47 = vfmaq_f64 (p45, x2, p67);
+  float64x2_t p47 = vfmaq_f64 (p45, z4, p67);
 
-  float64x2_t p07 = vfmaq_f64 (p03, x4, p47);
+  float64x2_t p07 = vfmaq_f64 (p03, z8, p47);
 
-  /* estrin_11.  */
+  /* Order-11 Estrin.  */
   float64x2_t p89 = vfmaq_laneq_f64 (d->c8, z2, c911, 0);
   float64x2_t p1011 = vfmaq_laneq_f64 (d->c10, z2, c911, 1);
-  float64x2_t p811 = vfmaq_f64 (p89, x2, p1011);
+  float64x2_t p811 = vfmaq_f64 (p89, z4, p1011);
 
   float64x2_t p1213 = vfmaq_laneq_f64 (d->c12, z2, c1315, 0);
   float64x2_t p1415 = vfmaq_laneq_f64 (d->c14, z2, c1315, 1);
-  float64x2_t p1215 = vfmaq_f64 (p1213, x2, p1415);
+  float64x2_t p1215 = vfmaq_f64 (p1213, z4, p1415);
 
   float64x2_t p1617 = vfmaq_laneq_f64 (d->c16, z2, c1719, 0);
   float64x2_t p1819 = vfmaq_laneq_f64 (d->c18, z2, c1719, 1);
-  float64x2_t p1619 = vfmaq_f64 (p1617, x2, p1819);
+  float64x2_t p1619 = vfmaq_f64 (p1617, z4, p1819);
 
-  float64x2_t p815 = vfmaq_f64 (p811, x4, p1215);
-  float64x2_t p819 = vfmaq_f64 (p815, x8, p1619);
+  float64x2_t p815 = vfmaq_f64 (p811, z8, p1215);
+  float64x2_t p819 = vfmaq_f64 (p815, z16, p1619);
 
-  float64x2_t y = vfmaq_f64 (p07, p819, x8);
+  float64x2_t y = vfmaq_f64 (p07, p819, z16);
 
   /* Finalize. y = shift + z + z^3 * P(z^2).  */
-  y = vfmaq_f64 (az, y, vmulq_f64 (z2, az));
-  y = vaddq_f64 (y, shift);
-
-  /* y = atan(x) if x>0, -atan(-x) otherwise.  */
-  y = vreinterpretq_f64_u64 (veorq_u64 (vreinterpretq_u64_f64 (y), sign));
-  return y;
+  y = vfmsq_f64 (v_f64 (-1.0), z2, y);
+  return vfmsq_f64 (shift, z, y);
 }
diff --git a/sysdeps/aarch64/fpu/atan_sve.c b/sysdeps/aarch64/fpu/atan_sve.c
index 3880cedff..a6b0489cf 100644
--- a/sysdeps/aarch64/fpu/atan_sve.c
+++ b/sysdeps/aarch64/fpu/atan_sve.c
@@ -18,23 +18,26 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include "sv_math.h"
-#include "poly_sve_f64.h"
 
 static const struct data
 {
-  float64_t poly[20];
-  float64_t pi_over_2;
+  float64_t c0, c2, c4, c6, c8, c10, c12, c14, c16, c18;
+  float64_t c1, c3, c5, c7, c9, c11, c13, c15, c17, c19;
+  float64_t shift_val, neg_one;
 } data = {
   /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
      [2**-1022, 1.0].  */
-  .poly = { -0x1.5555555555555p-2,  0x1.99999999996c1p-3, -0x1.2492492478f88p-3,
-            0x1.c71c71bc3951cp-4,   -0x1.745d160a7e368p-4, 0x1.3b139b6a88ba1p-4,
-            -0x1.11100ee084227p-4,  0x1.e1d0f9696f63bp-5, -0x1.aebfe7b418581p-5,
-            0x1.842dbe9b0d916p-5,   -0x1.5d30140ae5e99p-5, 0x1.338e31eb2fbbcp-5,
-            -0x1.00e6eece7de8p-5,   0x1.860897b29e5efp-6, -0x1.0051381722a59p-6,
-            0x1.14e9dc19a4a4ep-7,  -0x1.d0062b42fe3bfp-9, 0x1.17739e210171ap-10,
-            -0x1.ab24da7be7402p-13, 0x1.358851160a528p-16, },
-  .pi_over_2 = 0x1.921fb54442d18p+0,
+  .c0 = -0x1.555555555552ap-2,	     .c1 = 0x1.9999999995aebp-3,
+  .c2 = -0x1.24924923923f6p-3,	     .c3 = 0x1.c71c7184288a2p-4,
+  .c4 = -0x1.745d11fb3d32bp-4,	     .c5 = 0x1.3b136a18051b9p-4,
+  .c6 = -0x1.110e6d985f496p-4,	     .c7 = 0x1.e1bcf7f08801dp-5,
+  .c8 = -0x1.ae644e28058c3p-5,	     .c9 = 0x1.82eeb1fed85c6p-5,
+  .c10 = -0x1.59d7f901566cbp-5,	     .c11 = 0x1.2c982855ab069p-5,
+  .c12 = -0x1.eb49592998177p-6,	     .c13 = 0x1.69d8b396e3d38p-6,
+  .c14 = -0x1.ca980345c4204p-7,	     .c15 = 0x1.dc050eafde0b3p-8,
+  .c16 = -0x1.7ea70755b8eccp-9,	     .c17 = 0x1.ba3da3de903e8p-11,
+  .c18 = -0x1.44a4b059b6f67p-13,     .c19 = 0x1.c4a45029e5a91p-17,
+  .shift_val = 0x1.490fdaa22168cp+1, .neg_one = -1,
 };
 
 /* Useful constants.  */
@@ -43,15 +46,14 @@ static const struct data
 /* Fast implementation of SVE atan.
    Based on atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using
    z=1/x and shift = pi/2. Largest errors are close to 1. The maximum observed
-   error is 2.27 ulps:
-   _ZGVsMxv_atan (0x1.0005af27c23e9p+0) got 0x1.9225645bdd7c1p-1
-				       want 0x1.9225645bdd7c3p-1.  */
+   error is 2.08 ulps:
+   _ZGVsMxv_atan (0x1.000a7c56975e8p+0) got 0x1.922a3163e15c2p-1
+				       want 0x1.922a3163e15c4p-1.  */
 svfloat64_t SV_NAME_D1 (atan) (svfloat64_t x, const svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);
 
-  /* No need to trigger special case. Small cases, infs and nans
-     are supported by our approximation technique.  */
+  svbool_t ptrue = svptrue_b64 ();
   svuint64_t ix = svreinterpret_u64 (x);
   svuint64_t sign = svand_x (pg, ix, SignMask);
 
@@ -59,32 +61,60 @@ svfloat64_t SV_NAME_D1 (atan) (svfloat64_t x, const svbool_t pg)
      y := arctan(x) for x < 1
      y := pi/2 + arctan(-1/x) for x > 1
      Hence, use z=-1/a if x>=1, otherwise z=a.  */
-  svbool_t red = svacgt (pg, x, 1.0);
-  /* Avoid dependency in abs(x) in division (and comparison).  */
-  svfloat64_t z = svsel (red, svdivr_x (pg, x, 1.0), x);
-  /* Use absolute value only when needed (odd powers of z).  */
-  svfloat64_t az = svabs_x (pg, z);
-  az = svneg_m (az, red, az);
+  svbool_t red = svacgt (pg, x, d->neg_one);
+  svfloat64_t z = svsel (red, svdiv_x (pg, sv_f64 (d->neg_one), x), x);
+
+  /* Reuse -1.0 to reduce constant loads: the required shift of pi/2 is
+     created via -1 + (1 + pi/2), where shift_val holds 1 + pi/2.  */
+  svfloat64_t shift
+      = svadd_z (red, sv_f64 (d->neg_one), sv_f64 (d->shift_val));
+
+  /* Reinserts the sign bit of the argument to handle the case of x < -1.  */
+  shift = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (shift), sign));
 
   /* Use split Estrin scheme for P(z^2) with deg(P)=19.  */
-  svfloat64_t z2 = svmul_x (pg, z, z);
-  svfloat64_t x2 = svmul_x (pg, z2, z2);
-  svfloat64_t x4 = svmul_x (pg, x2, x2);
-  svfloat64_t x8 = svmul_x (pg, x4, x4);
+  svfloat64_t z2 = svmul_x (ptrue, z, z);
+  svfloat64_t z4 = svmul_x (ptrue, z2, z2);
+  svfloat64_t z8 = svmul_x (ptrue, z4, z4);
+  svfloat64_t z16 = svmul_x (ptrue, z8, z8);
 
-  svfloat64_t y
-      = svmla_x (pg, sv_estrin_7_f64_x (pg, z2, x2, x4, d->poly),
-		 sv_estrin_11_f64_x (pg, z2, x2, x4, x8, d->poly + 8), x8);
+  /* Order-7 Estrin.  */
+  svfloat64_t c13 = svld1rq (ptrue, &d->c1);
+  svfloat64_t c57 = svld1rq (ptrue, &d->c5);
 
-  /* y = shift + z + z^3 * P(z^2).  */
-  svfloat64_t z3 = svmul_x (pg, z2, az);
-  y = svmla_x (pg, az, z3, y);
+  svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), z2, c13, 0);
+  svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), z2, c13, 1);
+  svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), z2, c57, 0);
+  svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), z2, c57, 1);
+
+  svfloat64_t p03 = svmla_x (pg, p01, z4, p23);
+  svfloat64_t p47 = svmla_x (pg, p45, z4, p67);
+  svfloat64_t p07 = svmla_x (pg, p03, z8, p47);
+
+  /* Order-11 Estrin.  */
+  svfloat64_t c911 = svld1rq (ptrue, &d->c9);
+  svfloat64_t c1315 = svld1rq (ptrue, &d->c13);
+  svfloat64_t c1719 = svld1rq (ptrue, &d->c17);
 
-  /* Apply shift as indicated by `red` predicate.  */
-  y = svadd_m (red, y, d->pi_over_2);
+  svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), z2, c911, 0);
+  svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), z2, c911, 1);
+  svfloat64_t p811 = svmla_x (pg, p89, z4, p1011);
 
-  /* y = atan(x) if x>0, -atan(-x) otherwise.  */
-  y = svreinterpret_f64 (sveor_x (pg, svreinterpret_u64 (y), sign));
+  svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), z2, c1315, 0);
+  svfloat64_t p1415 = svmla_lane (sv_f64 (d->c14), z2, c1315, 1);
+  svfloat64_t p1215 = svmla_x (pg, p1213, z4, p1415);
 
-  return y;
+  svfloat64_t p1617 = svmla_lane (sv_f64 (d->c16), z2, c1719, 0);
+  svfloat64_t p1819 = svmla_lane (sv_f64 (d->c18), z2, c1719, 1);
+  svfloat64_t p1619 = svmla_x (pg, p1617, z4, p1819);
+
+  svfloat64_t p815 = svmla_x (pg, p811, z8, p1215);
+  svfloat64_t p819 = svmla_x (pg, p815, z16, p1619);
+
+  svfloat64_t y = svmla_x (pg, p07, z16, p819);
+
+  /* y = shift + z + z^3 * P(z^2).  */
+  shift = svadd_m (red, z, shift);
+  y = svmul_x (pg, z2, y);
+  return svmla_x (pg, shift, z, y);
 }
diff --git a/sysdeps/aarch64/fpu/atanf_advsimd.c b/sysdeps/aarch64/fpu/atanf_advsimd.c
index 472865ed7..817a47ef3 100644
--- a/sysdeps/aarch64/fpu/atanf_advsimd.c
+++ b/sysdeps/aarch64/fpu/atanf_advsimd.c
@@ -22,26 +22,35 @@
 
 static const struct data
 {
+  uint32x4_t sign_mask, pi_over_2;
+  float32x4_t neg_one;
+#if WANT_SIMD_EXCEPT
   float32x4_t poly[8];
-  float32x4_t pi_over_2;
+} data = {
+  .poly = { V4 (-0x1.5554dcp-2), V4 (0x1.9978ecp-3), V4 (-0x1.230a94p-3),
+	    V4 (0x1.b4debp-4), V4 (-0x1.3550dap-4), V4 (0x1.61eebp-5),
+	    V4 (-0x1.0c17d4p-6), V4 (0x1.7ea694p-9) },
+#else
+  float32x4_t c0, c2, c4, c6;
+  float c1, c3, c5, c7;
 } data = {
   /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
      [2**-128, 1.0].
      Generated using fpminimax between FLT_MIN and 1.  */
-  .poly = { V4 (-0x1.55555p-2f), V4 (0x1.99935ep-3f), V4 (-0x1.24051ep-3f),
-	    V4 (0x1.bd7368p-4f), V4 (-0x1.491f0ep-4f), V4 (0x1.93a2c0p-5f),
-	    V4 (-0x1.4c3c60p-6f), V4 (0x1.01fd88p-8f) },
-  .pi_over_2 = V4 (0x1.921fb6p+0f),
+  .c0 = V4 (-0x1.5554dcp-2),	.c1 = 0x1.9978ecp-3,
+  .c2 = V4 (-0x1.230a94p-3),	.c3 = 0x1.b4debp-4,
+  .c4 = V4 (-0x1.3550dap-4),	.c5 = 0x1.61eebp-5,
+  .c6 = V4 (-0x1.0c17d4p-6),	.c7 = 0x1.7ea694p-9,
+#endif
+  .pi_over_2 = V4 (0x3fc90fdb),
+  .neg_one = V4 (-1.0f),
+  .sign_mask = V4 (0x80000000),
 };
 
-#define SignMask v_u32 (0x80000000)
-
-#define P(i) d->poly[i]
-
+#if WANT_SIMD_EXCEPT
 #define TinyBound 0x30800000 /* asuint(0x1p-30).  */
 #define BigBound 0x4e800000  /* asuint(0x1p30).  */
 
-#if WANT_SIMD_EXCEPT
 static float32x4_t VPCS_ATTR NOINLINE
 special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
 {
@@ -51,19 +60,20 @@ special_case (float32x4_t x, float32x4_t y, uint32x4_t special)
 
 /* Fast implementation of vector atanf based on
    atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1]
-   using z=-1/x and shift = pi/2. Maximum observed error is 2.9ulps:
-   _ZGVnN4v_atanf (0x1.0468f6p+0) got 0x1.967f06p-1 want 0x1.967fp-1.  */
+   using z=-1/x and shift = pi/2. Maximum observed error is 2.02 ulps:
+   _ZGVnN4v_atanf (0x1.03d4cep+0) got 0x1.95ed3ap-1
+				 want 0x1.95ed36p-1.  */
 float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atan) (float32x4_t x)
 {
   const struct data *d = ptr_barrier (&data);
 
-  /* Small cases, infs and nans are supported by our approximation technique,
-     but do not set fenv flags correctly. Only trigger special case if we need
-     fenv.  */
   uint32x4_t ix = vreinterpretq_u32_f32 (x);
-  uint32x4_t sign = vandq_u32 (ix, SignMask);
+  uint32x4_t sign = vandq_u32 (ix, d->sign_mask);
 
 #if WANT_SIMD_EXCEPT
+  /* Small cases, infs and nans are supported by our approximation technique,
+     but do not set fenv flags correctly. Only trigger special case if we need
+     fenv.  */
   uint32x4_t ia = vandq_u32 (ix, v_u32 (0x7ff00000));
   uint32x4_t special = vcgtq_u32 (vsubq_u32 (ia, v_u32 (TinyBound)),
 				  v_u32 (BigBound - TinyBound));
@@ -71,41 +81,52 @@ float32x4_t VPCS_ATTR NOINLINE V_NAME_F1 (atan) (float32x4_t x)
   if (__glibc_unlikely (v_any_u32 (special)))
     return special_case (x, x, v_u32 (-1));
 #endif
-
   /* Argument reduction:
-     y := arctan(x) for x < 1
-     y := pi/2 + arctan(-1/x) for x > 1
-     Hence, use z=-1/a if x>=1, otherwise z=a.  */
-  uint32x4_t red = vcagtq_f32 (x, v_f32 (1.0));
-  /* Avoid dependency in abs(x) in division (and comparison).  */
-  float32x4_t z = vbslq_f32 (red, vdivq_f32 (v_f32 (1.0f), x), x);
+     y := arctan(x) for |x| < 1
+     y := arctan(-1/x) + pi/2 for x > +1
+     y := arctan(-1/x) - pi/2 for x < -1
+     Hence, use z=-1/x if |x|>|-1|, otherwise z=x.  */
+  uint32x4_t red = vcagtq_f32 (x, d->neg_one);
+
+  float32x4_t z = vbslq_f32 (red, vdivq_f32 (d->neg_one, x), x);
+
+  /* Shift is calculated as +-pi/2 or 0, depending on the argument case.  */
   float32x4_t shift = vreinterpretq_f32_u32 (
-      vandq_u32 (red, vreinterpretq_u32_f32 (d->pi_over_2)));
-  /* Use absolute value only when needed (odd powers of z).  */
-  float32x4_t az = vbslq_f32 (
-      SignMask, vreinterpretq_f32_u32 (vandq_u32 (SignMask, red)), z);
+      vandq_u32 (red, veorq_u32 (d->pi_over_2, sign)));
+
+  float32x4_t z2 = vmulq_f32 (z, z);
+  float32x4_t z3 = vmulq_f32 (z, z2);
+  float32x4_t z4 = vmulq_f32 (z2, z2);
+#if WANT_SIMD_EXCEPT
 
   /* Calculate the polynomial approximation.
      Use 2-level Estrin scheme for P(z^2) with deg(P)=7. However,
      a standard implementation using z8 creates spurious underflow
      in the very last fma (when z^8 is small enough).
-     Therefore, we split the last fma into a mul and an fma.
-     Horner and single-level Estrin have higher errors that exceed
-     threshold.  */
-  float32x4_t z2 = vmulq_f32 (z, z);
-  float32x4_t z4 = vmulq_f32 (z2, z2);
-
+     Therefore, we split the last fma into a mul and an fma.  */
   float32x4_t y = vfmaq_f32 (
       v_pairwise_poly_3_f32 (z2, z4, d->poly), z4,
       vmulq_f32 (z4, v_pairwise_poly_3_f32 (z2, z4, d->poly + 4)));
 
-  /* y = shift + z * P(z^2).  */
-  y = vaddq_f32 (vfmaq_f32 (az, y, vmulq_f32 (z2, az)), shift);
+#else
+  float32x4_t z8 = vmulq_f32 (z4, z4);
+
+  /* Uses an Estrin scheme for polynomial approximation.  */
+  float32x4_t odd_coeffs = vld1q_f32 (&d->c1);
+
+  float32x4_t p01 = vfmaq_laneq_f32 (d->c0, z2, odd_coeffs, 0);
+  float32x4_t p23 = vfmaq_laneq_f32 (d->c2, z2, odd_coeffs, 1);
+  float32x4_t p45 = vfmaq_laneq_f32 (d->c4, z2, odd_coeffs, 2);
+  float32x4_t p67 = vfmaq_laneq_f32 (d->c6, z2, odd_coeffs, 3);
 
-  /* y = atan(x) if x>0, -atan(-x) otherwise.  */
-  y = vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 (y), sign));
+  float32x4_t p03 = vfmaq_f32 (p01, z4, p23);
+  float32x4_t p47 = vfmaq_f32 (p45, z4, p67);
 
-  return y;
+  float32x4_t y = vfmaq_f32 (p03, z8, p47);
+#endif
+
+  /* y = shift + z + z^3 * P(z^2).  */
+  return vfmaq_f32 (vaddq_f32 (shift, z), z3, y);
 }
 libmvec_hidden_def (V_NAME_F1 (atan))
 HALF_WIDTH_ALIAS_F1 (atan)
diff --git a/sysdeps/aarch64/fpu/atanf_sve.c b/sysdeps/aarch64/fpu/atanf_sve.c
index 3a98d70c5..6558223e4 100644
--- a/sysdeps/aarch64/fpu/atanf_sve.c
+++ b/sysdeps/aarch64/fpu/atanf_sve.c
@@ -18,18 +18,26 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include "sv_math.h"
-#include "poly_sve_f32.h"
 
 static const struct data
 {
-  float32_t poly[8];
-  float32_t pi_over_2;
+  float32_t c1, c3, c5, c7;
+  float32_t c0, c2, c4, c6;
+  float32_t shift_val, neg_one;
 } data = {
   /* Coefficients of polynomial P such that atan(x)~x+x*P(x^2) on
     [2**-128, 1.0].  */
-  .poly = { -0x1.55555p-2f, 0x1.99935ep-3f, -0x1.24051ep-3f, 0x1.bd7368p-4f,
-	    -0x1.491f0ep-4f, 0x1.93a2c0p-5f, -0x1.4c3c60p-6f, 0x1.01fd88p-8f },
-  .pi_over_2 = 0x1.921fb6p+0f,
+  .c0 = -0x1.5554dcp-2,
+  .c1 = 0x1.9978ecp-3,
+  .c2 = -0x1.230a94p-3,
+  .c3 = 0x1.b4debp-4,
+  .c4 = -0x1.3550dap-4,
+  .c5 = 0x1.61eebp-5,
+  .c6 = -0x1.0c17d4p-6,
+  .c7 = 0x1.7ea694p-9,
+  /* pi/2, used as a shift value after reduction.  */
+  .shift_val = 0x1.921fb54442d18p+0,
+  .neg_one = -1.0f,
 };
 
 #define SignMask (0x80000000)
@@ -37,43 +45,49 @@ static const struct data
 /* Fast implementation of SVE atanf based on
    atan(x) ~ shift + z + z^3 * P(z^2) with reduction to [0,1] using
    z=-1/x and shift = pi/2.
-   Largest observed error is 2.9 ULP, close to +/-1.0:
-   _ZGVsMxv_atanf (0x1.0468f6p+0) got -0x1.967f06p-1
-				 want -0x1.967fp-1.  */
+   Largest observed error is 2.12 ULP:
+   _ZGVsMxv_atanf (0x1.03d4cep+0) got 0x1.95ed3ap-1
+				 want 0x1.95ed36p-1.  */
 svfloat32_t SV_NAME_F1 (atan) (svfloat32_t x, const svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);
+  svbool_t ptrue = svptrue_b32 ();
 
   /* No need to trigger special case. Small cases, infs and nans
      are supported by our approximation technique.  */
   svuint32_t ix = svreinterpret_u32 (x);
-  svuint32_t sign = svand_x (pg, ix, SignMask);
+  svuint32_t sign = svand_x (ptrue, ix, SignMask);
 
   /* Argument reduction:
      y := arctan(x) for x < 1
-     y := pi/2 + arctan(-1/x) for x > 1
-     Hence, use z=-1/a if x>=1, otherwise z=a.  */
-  svbool_t red = svacgt (pg, x, 1.0f);
-  /* Avoid dependency in abs(x) in division (and comparison).  */
-  svfloat32_t z = svsel (red, svdiv_x (pg, sv_f32 (1.0f), x), x);
-  /* Use absolute value only when needed (odd powers of z).  */
-  svfloat32_t az = svabs_x (pg, z);
-  az = svneg_m (az, red, az);
-
-  /* Use split Estrin scheme for P(z^2) with deg(P)=7.  */
-  svfloat32_t z2 = svmul_x (pg, z, z);
-  svfloat32_t z4 = svmul_x (pg, z2, z2);
-  svfloat32_t z8 = svmul_x (pg, z4, z4);
-
-  svfloat32_t y = sv_estrin_7_f32_x (pg, z2, z4, z8, d->poly);
-
-  /* y = shift + z + z^3 * P(z^2).  */
-  svfloat32_t z3 = svmul_x (pg, z2, az);
-  y = svmla_x (pg, az, z3, y);
-
-  /* Apply shift as indicated by 'red' predicate.  */
-  y = svadd_m (red, y, sv_f32 (d->pi_over_2));
-
-  /* y = atan(x) if x>0, -atan(-x) otherwise.  */
-  return svreinterpret_f32 (sveor_x (pg, svreinterpret_u32 (y), sign));
+     y := arctan(-1/x) + pi/2 for x > +1
+     y := arctan(-1/x) - pi/2 for x < -1
+     Hence, use z=-1/x if |x|>|-1|, otherwise z=x.  */
+  svbool_t red = svacgt (pg, x, d->neg_one);
+  svfloat32_t z = svsel (red, svdiv_x (pg, sv_f32 (d->neg_one), x), x);
+
+  /* Reinserts the sign bit of the argument to handle the case of x < -1.  */
+  svfloat32_t shift = svreinterpret_f32 (
+      sveor_x (red, svreinterpret_u32 (sv_f32 (d->shift_val)), sign));
+
+  svfloat32_t z2 = svmul_x (ptrue, z, z);
+  svfloat32_t z3 = svmul_x (ptrue, z2, z);
+  svfloat32_t z4 = svmul_x (ptrue, z2, z2);
+  svfloat32_t z8 = svmul_x (ptrue, z4, z4);
+
+  svfloat32_t odd_coeffs = svld1rq (ptrue, &d->c1);
+
+  svfloat32_t p01 = svmla_lane (sv_f32 (d->c0), z2, odd_coeffs, 0);
+  svfloat32_t p23 = svmla_lane (sv_f32 (d->c2), z2, odd_coeffs, 1);
+  svfloat32_t p45 = svmla_lane (sv_f32 (d->c4), z2, odd_coeffs, 2);
+  svfloat32_t p67 = svmla_lane (sv_f32 (d->c6), z2, odd_coeffs, 3);
+
+  svfloat32_t p03 = svmla_x (pg, p01, z4, p23);
+  svfloat32_t p47 = svmla_x (pg, p45, z4, p67);
+
+  svfloat32_t y = svmla_x (pg, p03, z8, p47);
+
+  /* shift + z + z^3 * P(z^2).  */
+  shift = svadd_m (red, z, shift);
+  return svmla_x (pg, shift, z3, y);
 }
diff --git a/sysdeps/aarch64/fpu/atanh_sve.c b/sysdeps/aarch64/fpu/atanh_sve.c
index 16a7cf6aa..958d69a5f 100644
--- a/sysdeps/aarch64/fpu/atanh_sve.c
+++ b/sysdeps/aarch64/fpu/atanh_sve.c
@@ -30,7 +30,7 @@ special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
 }
 
 /* SVE approximation for double-precision atanh, based on log1p.
-   The greatest observed error is 2.81 ULP:
+   The greatest observed error is 3.3 ULP:
    _ZGVsMxv_atanh(0x1.ffae6288b601p-6) got 0x1.ffd8ff31b5019p-6
 				      want 0x1.ffd8ff31b501cp-6.  */
 svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg)
@@ -42,7 +42,6 @@ svfloat64_t SV_NAME_D1 (atanh) (svfloat64_t x, const svbool_t pg)
   svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, Half));
 
   /* It is special if iax >= 1.  */
-//   svbool_t special = svcmpge (pg, iax, One);
   svbool_t special = svacge (pg, x, 1.0);
 
   /* Computation is performed based on the following sequence of equality:
diff --git a/sysdeps/aarch64/fpu/cosh_sve.c b/sysdeps/aarch64/fpu/cosh_sve.c
index ca4405353..f5a163b05 100644
--- a/sysdeps/aarch64/fpu/cosh_sve.c
+++ b/sysdeps/aarch64/fpu/cosh_sve.c
@@ -21,69 +21,99 @@
 
 static const struct data
 {
-  float64_t poly[3];
-  float64_t inv_ln2, ln2_hi, ln2_lo, shift, thres;
-  uint64_t index_mask, special_bound;
+  double c0, c2;
+  double c1, c3;
+  float64_t inv_ln2, ln2_hi, ln2_lo, shift;
+  uint64_t special_bound;
 } data = {
-  .poly = { 0x1.fffffffffffd4p-2, 0x1.5555571d6b68cp-3,
-	    0x1.5555576a59599p-5, },
-
-  .inv_ln2 = 0x1.71547652b82fep8, /* N/ln2.  */
-  /* -ln2/N.  */
-  .ln2_hi = -0x1.62e42fefa39efp-9,
-  .ln2_lo = -0x1.abc9e3b39803f3p-64,
-  .shift = 0x1.8p+52,
-  .thres = 704.0,
-
-  .index_mask = 0xff,
-  /* 0x1.6p9, above which exp overflows.  */
-  .special_bound = 0x4086000000000000,
+  /* Generated using Remez, in [-log(2)/128, log(2)/128].  */
+  .c0 = 0x1.fffffffffdbcdp-2,
+  .c1 = 0x1.555555555444cp-3,
+  .c2 = 0x1.555573c6a9f7dp-5,
+  .c3 = 0x1.1111266d28935p-7,
+  .ln2_hi = 0x1.62e42fefa3800p-1,
+  .ln2_lo = 0x1.ef35793c76730p-45,
+  /* 1/ln2.  */
+  .inv_ln2 = 0x1.71547652b82fep+0,
+  .shift = 0x1.800000000ff80p+46, /* 1.5*2^46+1022.  */
+
+  /* asuint(ln(2^(1024 - 1/128))), the value above which exp overflows.  */
+  .special_bound = 0x40862e37e7d8ba72,
 };
 
-static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
-{
-  return sv_call_f64 (cosh, x, y, special);
-}
-
-/* Helper for approximating exp(x). Copied from sv_exp_tail, with no
-   special-case handling or tail.  */
+/* Helper for approximating exp(x)/2.
+   Functionally identical to FEXPA exp(x), except for an adjustment in
+   the shift value which reduces the exponent of scale by 1,
+   thus halving the result at no cost.  */
 static inline svfloat64_t
-exp_inline (svfloat64_t x, const svbool_t pg, const struct data *d)
+exp_over_two_inline (const svbool_t pg, svfloat64_t x, const struct data *d)
 {
   /* Calculate exp(x).  */
   svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
+  svuint64_t u = svreinterpret_u64 (z);
   svfloat64_t n = svsub_x (pg, z, d->shift);
 
-  svfloat64_t r = svmla_x (pg, x, n, d->ln2_hi);
-  r = svmla_x (pg, r, n, d->ln2_lo);
+  svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
+  svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
 
-  svuint64_t u = svreinterpret_u64 (z);
-  svuint64_t e = svlsl_x (pg, u, 52 - V_EXP_TAIL_TABLE_BITS);
-  svuint64_t i = svand_x (pg, u, d->index_mask);
+  svfloat64_t r = x;
+  r = svmls_lane (r, n, ln2, 0);
+  r = svmls_lane (r, n, ln2, 1);
 
-  svfloat64_t y = svmla_x (pg, sv_f64 (d->poly[1]), r, d->poly[2]);
-  y = svmla_x (pg, sv_f64 (d->poly[0]), r, y);
-  y = svmla_x (pg, sv_f64 (1.0), r, y);
-  y = svmul_x (pg, r, y);
+  svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+  svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0);
+  svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1);
+  svfloat64_t p04 = svmla_x (pg, p01, p23, r2);
+  svfloat64_t p = svmla_x (pg, r, p04, r2);
 
-  /* s = 2^(n/N).  */
-  u = svld1_gather_index (pg, __v_exp_tail_data, i);
-  svfloat64_t s = svreinterpret_f64 (svadd_x (pg, u, e));
+  svfloat64_t scale = svexpa (u);
 
-  return svmla_x (pg, s, s, y);
+  return svmla_x (pg, scale, scale, p);
+}
+
+/* Vectorised special case for inputs where exp_over_two_inline overflows.
+   Halves the input value and uses the identity exp(x) = exp(x/2)^2 to double
+   the valid range of inputs, and returns inf for anything past that.  */
+static svfloat64_t NOINLINE
+special_case (svbool_t pg, svbool_t special, svfloat64_t ax, svfloat64_t t,
+	      const struct data *d)
+{
+  /* Finish fast path to compute values for non-special cases.  */
+  svfloat64_t inv_twoexp = svdivr_x (pg, t, 0.25);
+  svfloat64_t y = svadd_x (pg, t, inv_twoexp);
+
+  /* Halve the input value, then check whether any lanes
+     are still going to overflow.  */
+  ax = svmul_x (special, ax, 0.5);
+  svbool_t is_safe
+      = svcmplt (special, svreinterpret_u64 (ax), d->special_bound);
+
+  /* Computes exp(x/2), and sets any overflowing lanes to inf.  */
+  svfloat64_t half_exp = exp_over_two_inline (special, ax, d);
+  half_exp = svsel (is_safe, half_exp, sv_f64 (INFINITY));
+
+  /* Construct special case cosh(x) = (exp(x/2)^2)/2.  */
+  svfloat64_t exp = svmul_x (svptrue_b64 (), half_exp, 2);
+  svfloat64_t special_y = svmul_x (special, exp, half_exp);
+
+  /* Select correct return values for special and non-special cases.  */
+  special_y = svsel (special, special_y, y);
+
+  /* Ensure an input of nan is correctly propagated.  */
+  svbool_t is_nan
+      = svcmpgt (special, svreinterpret_u64 (ax), sv_u64 (0x7ff0000000000000));
+  return svsel (is_nan, ax, svsel (special, special_y, y));
 }
 
 /* Approximation for SVE double-precision cosh(x) using exp_inline.
    cosh(x) = (exp(x) + exp(-x)) / 2.
-   The greatest observed error is in the scalar fall-back region, so is the
-   same as the scalar routine, 1.93 ULP:
-   _ZGVsMxv_cosh (0x1.628ad45039d2fp+9) got 0x1.fd774e958236dp+1021
-				       want 0x1.fd774e958236fp+1021.
-
-   The greatest observed error in the non-special region is 1.54 ULP:
-   _ZGVsMxv_cosh (0x1.ba5651dd4486bp+2) got 0x1.f5e2bb8d5c98fp+8
-				       want 0x1.f5e2bb8d5c991p+8.  */
+   The greatest observed error in the special case region is 2.66 + 0.5 ULP:
+   _ZGVsMxv_cosh (0x1.633b532ffbc1ap+9) got 0x1.f9b2d3d22399ep+1023
+				       want 0x1.f9b2d3d22399bp+1023.
+
+   The greatest observed error in the non-special region is 1.01 + 0.5 ULP:
+   _ZGVsMxv_cosh (0x1.998ecbb3c1f81p+1) got 0x1.890b225657f84p+3
+				       want 0x1.890b225657f82p+3.  */
 svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);
@@ -92,14 +122,13 @@ svfloat64_t SV_NAME_D1 (cosh) (svfloat64_t x, const svbool_t pg)
   svbool_t special = svcmpgt (pg, svreinterpret_u64 (ax), d->special_bound);
 
   /* Up to the point that exp overflows, we can use it to calculate cosh by
-     exp(|x|) / 2 + 1 / (2 * exp(|x|)).  */
-  svfloat64_t t = exp_inline (ax, pg, d);
-  svfloat64_t half_t = svmul_x (pg, t, 0.5);
-  svfloat64_t half_over_t = svdivr_x (pg, t, 0.5);
+     exp(|x|)/2 + 1 / (2 * exp(|x|)).  */
+  svfloat64_t half_exp = exp_over_two_inline (pg, ax, d);
 
-  /* Fall back to scalar for any special cases.  */
+  /* Falls back to entirely standalone vectorized special case.  */
   if (__glibc_unlikely (svptest_any (pg, special)))
-    return special_case (x, svadd_x (pg, half_t, half_over_t), special);
+    return special_case (pg, special, ax, half_exp, d);
 
-  return svadd_x (pg, half_t, half_over_t);
+  svfloat64_t inv_twoexp = svdivr_x (pg, half_exp, 0.25);
+  return svadd_x (pg, half_exp, inv_twoexp);
 }
diff --git a/sysdeps/aarch64/fpu/coshf_sve.c b/sysdeps/aarch64/fpu/coshf_sve.c
index fb8e06cf7..805605541 100644
--- a/sysdeps/aarch64/fpu/coshf_sve.c
+++ b/sysdeps/aarch64/fpu/coshf_sve.c
@@ -39,9 +39,9 @@ special_case (svfloat32_t x, svfloat32_t half_e, svfloat32_t half_over_e,
 }
 
 /* Single-precision vector cosh, using vector expf.
-   Maximum error is 2.77 ULP:
-   _ZGVsMxv_coshf(-0x1.5b38f4p+1) got 0x1.e45946p+2
-				 want 0x1.e4594cp+2.  */
+   Maximum error is 2.56 +0.5 ULP:
+   _ZGVsMxv_coshf(-0x1.5b40f4p+1) got 0x1.e47748p+2
+				 want 0x1.e4774ep+2.  */
 svfloat32_t SV_NAME_F1 (cosh) (svfloat32_t x, svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);
diff --git a/sysdeps/aarch64/fpu/erfcf_sve.c b/sysdeps/aarch64/fpu/erfcf_sve.c
index 2743f9dbb..b57ab514b 100644
--- a/sysdeps/aarch64/fpu/erfcf_sve.c
+++ b/sysdeps/aarch64/fpu/erfcf_sve.c
@@ -76,7 +76,7 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg)
   svuint32_t i = svqadd (svreinterpret_u32 (z), dat->off_idx);
 
   /* Lookup erfc(r) and 2/sqrt(pi)*exp(-r^2) in tables.  */
-  i = svmul_x (pg, i, 2);
+  i = svlsl_x (svptrue_b32 (), i, 1);
   const float32_t *p = &__v_erfcf_data.tab[0].erfc - 2 * dat->off_arr;
   svfloat32_t erfcr = svld1_gather_index (pg, p, i);
   svfloat32_t scale = svld1_gather_index (pg, p + 1, i);
@@ -84,15 +84,15 @@ svfloat32_t SV_NAME_F1 (erfc) (svfloat32_t x, const svbool_t pg)
   /* erfc(x) ~ erfc(r) - scale * d * poly(r, d).  */
   svfloat32_t r = svsub_x (pg, z, shift);
   svfloat32_t d = svsub_x (pg, a, r);
-  svfloat32_t d2 = svmul_x (pg, d, d);
-  svfloat32_t r2 = svmul_x (pg, r, r);
+  svfloat32_t d2 = svmul_x (svptrue_b32 (), d, d);
+  svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
 
   svfloat32_t coeffs = svld1rq (svptrue_b32 (), &dat->third);
-  svfloat32_t third = svdup_lane (coeffs, 0);
 
   svfloat32_t p1 = r;
-  svfloat32_t p2 = svmls_lane (third, r2, coeffs, 1);
-  svfloat32_t p3 = svmul_x (pg, r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0));
+  svfloat32_t p2 = svmls_lane (sv_f32 (dat->third), r2, coeffs, 1);
+  svfloat32_t p3
+      = svmul_x (svptrue_b32 (), r, svmla_lane (sv_f32 (-0.5), r2, coeffs, 0));
   svfloat32_t p4 = svmla_lane (sv_f32 (dat->two_over_five), r2, coeffs, 2);
   p4 = svmls_x (pg, sv_f32 (dat->tenth), r2, p4);
 
diff --git a/sysdeps/aarch64/fpu/exp10_sve.c b/sysdeps/aarch64/fpu/exp10_sve.c
index f71bafdf0..53b28934d 100644
--- a/sysdeps/aarch64/fpu/exp10_sve.c
+++ b/sysdeps/aarch64/fpu/exp10_sve.c
@@ -18,21 +18,23 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include "sv_math.h"
-#include "poly_sve_f64.h"
 
 #define SpecialBound 307.0 /* floor (log10 (2^1023)).  */
 
 static const struct data
 {
-  double poly[5];
+  double c1, c3, c2, c4, c0;
   double shift, log10_2, log2_10_hi, log2_10_lo, scale_thres, special_bound;
 } data = {
   /* Coefficients generated using Remez algorithm.
      rel error: 0x1.9fcb9b3p-60
      abs error: 0x1.a20d9598p-60 in [ -log10(2)/128, log10(2)/128 ]
      max ulp err 0.52 +0.5.  */
-  .poly = { 0x1.26bb1bbb55516p1, 0x1.53524c73cd32ap1, 0x1.0470591daeafbp1,
-	    0x1.2bd77b1361ef6p0, 0x1.142b5d54e9621p-1 },
+  .c0 = 0x1.26bb1bbb55516p1,
+  .c1 = 0x1.53524c73cd32ap1,
+  .c2 = 0x1.0470591daeafbp1,
+  .c3 = 0x1.2bd77b1361ef6p0,
+  .c4 = 0x1.142b5d54e9621p-1,
   /* 1.5*2^46+1023. This value is further explained below.  */
   .shift = 0x1.800000000ffc0p+46,
   .log10_2 = 0x1.a934f0979a371p1,     /* 1/log2(10).  */
@@ -70,9 +72,9 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n,
   /* |n| > 1280 => 2^(n) overflows.  */
   svbool_t p_cmp = svacgt (pg, n, d->scale_thres);
 
-  svfloat64_t r1 = svmul_x (pg, s1, s1);
+  svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
   svfloat64_t r2 = svmla_x (pg, s2, s2, y);
-  svfloat64_t r0 = svmul_x (pg, r2, s1);
+  svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
 
   return svsel (p_cmp, r1, r0);
 }
@@ -103,11 +105,14 @@ svfloat64_t SV_NAME_D1 (exp10) (svfloat64_t x, svbool_t pg)
      comes at significant performance cost.  */
   svuint64_t u = svreinterpret_u64 (z);
   svfloat64_t scale = svexpa (u);
-
+  svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2);
   /* Approximate exp10(r) using polynomial.  */
-  svfloat64_t r2 = svmul_x (pg, r, r);
-  svfloat64_t y = svmla_x (pg, svmul_x (pg, r, d->poly[0]), r2,
-			   sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly + 1));
+  svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+  svfloat64_t p12 = svmla_lane (sv_f64 (d->c1), r, c24, 0);
+  svfloat64_t p34 = svmla_lane (sv_f64 (d->c3), r, c24, 1);
+  svfloat64_t p14 = svmla_x (pg, p12, p34, r2);
+
+  svfloat64_t y = svmla_x (pg, svmul_x (svptrue_b64 (), r, d->c0), r2, p14);
 
   /* Assemble result as exp10(x) = 2^n * exp10(r).  If |x| > SpecialBound
      multiplication may overflow, so use special case routine.  */
diff --git a/sysdeps/aarch64/fpu/exp10f_sve.c b/sysdeps/aarch64/fpu/exp10f_sve.c
index 1a74db265..f3e7f8b4f 100644
--- a/sysdeps/aarch64/fpu/exp10f_sve.c
+++ b/sysdeps/aarch64/fpu/exp10f_sve.c
@@ -19,26 +19,19 @@
 
 #include "sv_math.h"
 
-/* For x < -Thres, the result is subnormal and not handled correctly by
-   FEXPA.  */
-#define Thres 37.9
+/* For x < -Thres (-log10(2^126)), the result is subnormal and not handled
+   correctly by FEXPA.  */
+#define Thres 0x1.2f702p+5
 
 static const struct data
 {
-  float log2_10_lo, c0, c2, c4;
-  float c1, c3, log10_2;
-  float shift, log2_10_hi, thres;
+  float log10_2, log2_10_hi, log2_10_lo, c1;
+  float c0, shift, thres;
 } data = {
   /* Coefficients generated using Remez algorithm with minimisation of relative
-     error.
-     rel error: 0x1.89dafa3p-24
-     abs error: 0x1.167d55p-23 in [-log10(2)/2, log10(2)/2]
-     maxerr: 0.52 +0.5 ulp.  */
-  .c0 = 0x1.26bb16p+1f,
-  .c1 = 0x1.5350d2p+1f,
-  .c2 = 0x1.04744ap+1f,
-  .c3 = 0x1.2d8176p+0f,
-  .c4 = 0x1.12b41ap-1f,
+     error.  */
+  .c0 = 0x1.26bb62p1,
+  .c1 = 0x1.53524cp1,
   /* 1.5*2^17 + 127, a shift value suitable for FEXPA.  */
   .shift = 0x1.803f8p17f,
   .log10_2 = 0x1.a934fp+1,
@@ -53,28 +46,23 @@ sv_exp10f_inline (svfloat32_t x, const svbool_t pg, const struct data *d)
   /* exp10(x) = 2^(n/N) * 10^r = 2^n * (1 + poly (r)),
      with poly(r) in [1/sqrt(2), sqrt(2)] and
      x = r + n * log10(2) / N, with r in [-log10(2)/2N, log10(2)/2N].  */
-
-  svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->log2_10_lo);
+  svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->log10_2);
 
   /* n = round(x/(log10(2)/N)).  */
   svfloat32_t shift = sv_f32 (d->shift);
-  svfloat32_t z = svmad_x (pg, sv_f32 (d->log10_2), x, shift);
-  svfloat32_t n = svsub_x (svptrue_b32 (), z, shift);
+  svfloat32_t z = svmla_lane (shift, x, lane_consts, 0);
+  svfloat32_t n = svsub_x (pg, z, shift);
 
   /* r = x - n*log10(2)/N.  */
-  svfloat32_t r = svmsb_x (pg, sv_f32 (d->log2_10_hi), n, x);
-  r = svmls_lane (r, n, lane_consts, 0);
+  svfloat32_t r = x;
+  r = svmls_lane (r, n, lane_consts, 1);
+  r = svmls_lane (r, n, lane_consts, 2);
 
   svfloat32_t scale = svexpa (svreinterpret_u32 (z));
 
   /* Polynomial evaluation: poly(r) ~ exp10(r)-1.  */
-  svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2);
-  svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3);
-  svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
-  svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
-  svfloat32_t p0 = svmul_lane (r, lane_consts, 1);
-  svfloat32_t poly = svmla_x (pg, p0, r2, p14);
-
+  svfloat32_t poly = svmla_lane (sv_f32 (d->c0), r, lane_consts, 3);
+  poly = svmul_x (pg, poly, r);
   return svmla_x (pg, scale, scale, poly);
 }
 
@@ -85,11 +73,10 @@ special_case (svfloat32_t x, svbool_t special, const struct data *d)
 		      special);
 }
 
-/* Single-precision SVE exp10f routine. Implements the same algorithm
-   as AdvSIMD exp10f.
-   Worst case error is 1.02 ULPs.
-   _ZGVsMxv_exp10f(-0x1.040488p-4) got 0x1.ba5f9ep-1
-				  want 0x1.ba5f9cp-1.  */
+/* Single-precision SVE exp10f routine. Based on the FEXPA instruction.
+   Worst case error is 1.10 ULP.
+   _ZGVsMxv_exp10f (0x1.cc76dep+3) got 0x1.be0172p+47
+				  want 0x1.be017p+47.  */
 svfloat32_t SV_NAME_F1 (exp10) (svfloat32_t x, const svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);
diff --git a/sysdeps/aarch64/fpu/exp2_sve.c b/sysdeps/aarch64/fpu/exp2_sve.c
index a37c33092..c13585253 100644
--- a/sysdeps/aarch64/fpu/exp2_sve.c
+++ b/sysdeps/aarch64/fpu/exp2_sve.c
@@ -18,25 +18,22 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include "sv_math.h"
-#include "poly_sve_f64.h"
-
-#define N (1 << V_EXP_TABLE_BITS)
 
 #define BigBound 1022
 #define UOFlowBound 1280
 
 static const struct data
 {
-  double poly[4];
+  double c2, c4;
+  double c0, c1, c3;
   double shift, big_bound, uoflow_bound;
 } data = {
   /* Coefficients are computed using Remez algorithm with
      minimisation of the absolute error.  */
-  .poly = { 0x1.62e42fefa3686p-1, 0x1.ebfbdff82c241p-3, 0x1.c6b09b16de99ap-5,
-	    0x1.3b2abf5571ad8p-7 },
-  .shift = 0x1.8p52 / N,
-  .uoflow_bound = UOFlowBound,
-  .big_bound = BigBound,
+  .c0 = 0x1.62e42fefa39efp-1,  .c1 = 0x1.ebfbdff82a31bp-3,
+  .c2 = 0x1.c6b08d706c8a5p-5,  .c3 = 0x1.3b2ad2ff7d2f3p-7,
+  .c4 = 0x1.5d8761184beb3p-10, .shift = 0x1.800000000ffc0p+46,
+  .uoflow_bound = UOFlowBound, .big_bound = BigBound,
 };
 
 #define SpecialOffset 0x6000000000000000 /* 0x1p513.  */
@@ -65,47 +62,52 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n,
       svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b));
 
   /* |n| > 1280 => 2^(n) overflows.  */
-  svbool_t p_cmp = svacgt (pg, n, d->uoflow_bound);
+  svbool_t p_cmp = svacle (pg, n, d->uoflow_bound);
 
-  svfloat64_t r1 = svmul_x (pg, s1, s1);
+  svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
   svfloat64_t r2 = svmla_x (pg, s2, s2, y);
-  svfloat64_t r0 = svmul_x (pg, r2, s1);
+  svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
 
-  return svsel (p_cmp, r1, r0);
+  return svsel (p_cmp, r0, r1);
 }
 
 /* Fast vector implementation of exp2.
-   Maximum measured error is 1.65 ulp.
-   _ZGVsMxv_exp2(-0x1.4c264ab5b559bp-6) got 0x1.f8db0d4df721fp-1
-				       want 0x1.f8db0d4df721dp-1.  */
+   Maximum measured error is 0.52 + 0.5 ulp.
+   _ZGVsMxv_exp2 (0x1.3b72ad5b701bfp-1) got 0x1.8861641b49e08p+0
+				       want 0x1.8861641b49e07p+0.  */
 svfloat64_t SV_NAME_D1 (exp2) (svfloat64_t x, svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);
-  svbool_t no_big_scale = svacle (pg, x, d->big_bound);
-  svbool_t special = svnot_z (pg, no_big_scale);
-
-  /* Reduce x to k/N + r, where k is integer and r in [-1/2N, 1/2N].  */
-  svfloat64_t shift = sv_f64 (d->shift);
-  svfloat64_t kd = svadd_x (pg, x, shift);
-  svuint64_t ki = svreinterpret_u64 (kd);
-  /* kd = k/N.  */
-  kd = svsub_x (pg, kd, shift);
-  svfloat64_t r = svsub_x (pg, x, kd);
-
-  /* scale ~= 2^(k/N).  */
-  svuint64_t idx = svand_x (pg, ki, N - 1);
-  svuint64_t sbits = svld1_gather_index (pg, __v_exp_data, idx);
-  /* This is only a valid scale when -1023*N < k < 1024*N.  */
-  svuint64_t top = svlsl_x (pg, ki, 52 - V_EXP_TABLE_BITS);
-  svfloat64_t scale = svreinterpret_f64 (svadd_x (pg, sbits, top));
+  svbool_t special = svacge (pg, x, d->big_bound);
+
+  svfloat64_t z = svadd_x (svptrue_b64 (), x, d->shift);
+  svfloat64_t n = svsub_x (svptrue_b64 (), z, d->shift);
+  svfloat64_t r = svsub_x (svptrue_b64 (), x, n);
+
+  svfloat64_t scale = svexpa (svreinterpret_u64 (z));
+
+  svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+  svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2);
 
   /* Approximate exp2(r) using polynomial.  */
-  svfloat64_t r2 = svmul_x (pg, r, r);
-  svfloat64_t p = sv_pairwise_poly_3_f64_x (pg, r, r2, d->poly);
-  svfloat64_t y = svmul_x (pg, r, p);
+  /* y = exp2(r) - 1 ~= r * (C0 + C1 r + C2 r^2 + C3 r^3 + C4 r^4).  */
+  svfloat64_t p12 = svmla_lane (sv_f64 (d->c1), r, c24, 0);
+  svfloat64_t p34 = svmla_lane (sv_f64 (d->c3), r, c24, 1);
+  svfloat64_t p = svmla_x (pg, p12, p34, r2);
+  p = svmad_x (pg, p, r, d->c0);
+  svfloat64_t y = svmul_x (svptrue_b64 (), r, p);
 
   /* Assemble exp2(x) = exp2(r) * scale.  */
   if (__glibc_unlikely (svptest_any (pg, special)))
-    return special_case (pg, scale, y, kd, d);
+    {
+      /* FEXPA zeroes the sign bit; however, the sign is meaningful to the
+         special case function, so it needs to be copied.
+         e = sign bit of (asuint (z) << 46).  */
+      svuint64_t e = svand_x (pg, svlsl_x (pg, svreinterpret_u64 (z), 46),
+            0x8000000000000000);
+      scale = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (scale)));
+      return special_case (pg, scale, y, n, d);
+    }
+
   return svmla_x (pg, scale, scale, y);
 }
diff --git a/sysdeps/aarch64/fpu/exp2f_sve.c b/sysdeps/aarch64/fpu/exp2f_sve.c
index fcd783016..989cefb60 100644
--- a/sysdeps/aarch64/fpu/exp2f_sve.c
+++ b/sysdeps/aarch64/fpu/exp2f_sve.c
@@ -18,21 +18,17 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include "sv_math.h"
-#include "poly_sve_f32.h"
 
 #define Thres 0x1.5d5e2ap+6f
 
 static const struct data
 {
-  float c0, c2, c4, c1, c3;
-  float shift, thres;
+  float c0, c1, shift, thres;
 } data = {
-  /* Coefficients copied from the polynomial in AdvSIMD variant.  */
-  .c0 = 0x1.62e422p-1f,
-  .c1 = 0x1.ebf9bcp-3f,
-  .c2 = 0x1.c6bd32p-5f,
-  .c3 = 0x1.3ce9e4p-7f,
-  .c4 = 0x1.59977ap-10f,
+  /* Coefficients generated using Remez algorithm with minimisation of relative
+     error.  */
+  .c0 = 0x1.62e485p-1,
+  .c1 = 0x1.ebfbe0p-3,
   /* 1.5*2^17 + 127.  */
   .shift = 0x1.803f8p17f,
   /* Roughly 87.3. For x < -Thres, the result is subnormal and not handled
@@ -51,16 +47,8 @@ sv_exp2f_inline (svfloat32_t x, const svbool_t pg, const struct data *d)
 
   svfloat32_t scale = svexpa (svreinterpret_u32 (z));
 
-  /* Polynomial evaluation: poly(r) ~ exp2(r)-1.
-     Evaluate polynomial use hybrid scheme - offset ESTRIN by 1 for
-     coefficients 1 to 4, and apply most significant coefficient directly.  */
-  svfloat32_t even_coeffs = svld1rq (svptrue_b32 (), &d->c0);
-  svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
-  svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, even_coeffs, 1);
-  svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, even_coeffs, 2);
-  svfloat32_t p14 = svmla_x (pg, p12, r2, p34);
-  svfloat32_t p0 = svmul_lane (r, even_coeffs, 0);
-  svfloat32_t poly = svmla_x (pg, p0, r2, p14);
+  svfloat32_t poly = svmla_x (pg, sv_f32 (d->c0), r, sv_f32 (d->c1));
+  poly = svmul_x (svptrue_b32 (), poly, r);
 
   return svmla_x (pg, scale, scale, poly);
 }
@@ -72,11 +60,10 @@ special_case (svfloat32_t x, svbool_t special, const struct data *d)
 		      special);
 }
 
-/* Single-precision SVE exp2f routine. Implements the same algorithm
-   as AdvSIMD exp2f.
-   Worst case error is 1.04 ULPs.
-   _ZGVsMxv_exp2f(-0x1.af994ap-3) got 0x1.ba6a66p-1
-				 want 0x1.ba6a64p-1.  */
+/* Single-precision SVE exp2f routine, based on the FEXPA instruction.
+   Worst case error is 1.09 ULPs.
+   _ZGVsMxv_exp2f (0x1.9a2a94p-1) got 0x1.be1054p+0
+				 want 0x1.be1052p+0.  */
 svfloat32_t SV_NAME_F1 (exp2) (svfloat32_t x, const svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);
diff --git a/sysdeps/aarch64/fpu/exp_sve.c b/sysdeps/aarch64/fpu/exp_sve.c
index 37de751f9..dc049482e 100644
--- a/sysdeps/aarch64/fpu/exp_sve.c
+++ b/sysdeps/aarch64/fpu/exp_sve.c
@@ -21,12 +21,15 @@
 
 static const struct data
 {
-  double poly[4];
+  double c0, c2;
+  double c1, c3;
   double ln2_hi, ln2_lo, inv_ln2, shift, thres;
+
 } data = {
-  .poly = { /* ulp error: 0.53.  */
-	    0x1.fffffffffdbcdp-2, 0x1.555555555444cp-3, 0x1.555573c6a9f7dp-5,
-	    0x1.1111266d28935p-7 },
+  .c0 = 0x1.fffffffffdbcdp-2,
+  .c1 = 0x1.555555555444cp-3,
+  .c2 = 0x1.555573c6a9f7dp-5,
+  .c3 = 0x1.1111266d28935p-7,
   .ln2_hi = 0x1.62e42fefa3800p-1,
   .ln2_lo = 0x1.ef35793c76730p-45,
   /* 1/ln2.  */
@@ -36,7 +39,6 @@ static const struct data
   .thres = 704.0,
 };
 
-#define C(i) sv_f64 (d->poly[i])
 #define SpecialOffset 0x6000000000000000 /* 0x1p513.  */
 /* SpecialBias1 + SpecialBias1 = asuint(1.0).  */
 #define SpecialBias1 0x7000000000000000 /* 0x1p769.  */
@@ -56,20 +58,20 @@ special_case (svbool_t pg, svfloat64_t s, svfloat64_t y, svfloat64_t n)
   svuint64_t b
       = svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0.  */
 
-  /* Set s1 to generate overflow depending on sign of exponent n.  */
-  svfloat64_t s1 = svreinterpret_f64 (
-      svsubr_x (pg, b, SpecialBias1)); /* 0x70...0 - b.  */
-  /* Offset s to avoid overflow in final result if n is below threshold.  */
+  /* Set s1 to generate overflow depending on sign of exponent n,
+     ie. s1 = 0x70...0 - b.  */
+  svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1));
+  /* Offset s to avoid overflow in final result if n is below threshold.
+     ie. s2 = as_u64 (s) - 0x3010...0 + b.  */
   svfloat64_t s2 = svreinterpret_f64 (
-      svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2),
-	       b)); /* as_u64 (s) - 0x3010...0 + b.  */
+      svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b));
 
   /* |n| > 1280 => 2^(n) overflows.  */
   svbool_t p_cmp = svacgt (pg, n, 1280.0);
 
-  svfloat64_t r1 = svmul_x (pg, s1, s1);
+  svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
   svfloat64_t r2 = svmla_x (pg, s2, s2, y);
-  svfloat64_t r0 = svmul_x (pg, r2, s1);
+  svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
 
   return svsel (p_cmp, r1, r0);
 }
@@ -103,16 +105,16 @@ svfloat64_t SV_NAME_D1 (exp) (svfloat64_t x, const svbool_t pg)
   svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
   svuint64_t u = svreinterpret_u64 (z);
   svfloat64_t n = svsub_x (pg, z, d->shift);
-
+  svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
   /* r = x - n * ln2, r is in [-ln2/(2N), ln2/(2N)].  */
   svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
   svfloat64_t r = svmls_lane (x, n, ln2, 0);
   r = svmls_lane (r, n, ln2, 1);
 
   /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5.  */
-  svfloat64_t r2 = svmul_x (pg, r, r);
-  svfloat64_t p01 = svmla_x (pg, C (0), C (1), r);
-  svfloat64_t p23 = svmla_x (pg, C (2), C (3), r);
+  svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+  svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), r, c13, 0);
+  svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), r, c13, 1);
   svfloat64_t p04 = svmla_x (pg, p01, p23, r2);
   svfloat64_t y = svmla_x (pg, r, p04, r2);
 
diff --git a/sysdeps/aarch64/fpu/expf_sve.c b/sysdeps/aarch64/fpu/expf_sve.c
index f9249db8b..c3619975b 100644
--- a/sysdeps/aarch64/fpu/expf_sve.c
+++ b/sysdeps/aarch64/fpu/expf_sve.c
@@ -40,9 +40,9 @@ special_case (svfloat32_t x, svbool_t special, const struct sv_expf_data *d)
 }
 
 /* Optimised single-precision SVE exp function.
-   Worst-case error is 1.04 ulp:
-   SV_NAME_F1 (exp)(0x1.a8eda4p+1) got 0x1.ba74bcp+4
-				  want 0x1.ba74bap+4.  */
+   Worst-case error is 0.88 + 0.50 ULP:
+   _ZGVsMxv_expf(-0x1.bba276p-6) got 0x1.f25288p-1
+				want 0x1.f2528ap-1.  */
 svfloat32_t SV_NAME_F1 (exp) (svfloat32_t x, const svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);
diff --git a/sysdeps/aarch64/fpu/expm1_sve.c b/sysdeps/aarch64/fpu/expm1_sve.c
index d4ba8ccf3..b1d940bd2 100644
--- a/sysdeps/aarch64/fpu/expm1_sve.c
+++ b/sysdeps/aarch64/fpu/expm1_sve.c
@@ -18,82 +18,164 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include "sv_math.h"
-#include "poly_sve_f64.h"
 
-#define SpecialBound 0x1.62b7d369a5aa9p+9
-#define ExponentBias 0x3ff0000000000000
+#define FexpaBound 0x1.4cb5ecef28adap-3 /* 15*ln2/64.  */
+#define SpecialBound 0x1.628c2855bfaddp+9 /* ln(2^(1023 + 1/128)).  */
 
 static const struct data
 {
-  double poly[11];
-  double shift, inv_ln2, special_bound;
-  /* To be loaded in one quad-word.  */
+  double c2, c4;
+  double inv_ln2;
   double ln2_hi, ln2_lo;
+  double c0, c1, c3;
+  double shift, thres;
+  uint64_t expm1_data[32];
 } data = {
-  /* Generated using fpminimax.  */
-  .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
-            0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10, 0x1.a01a01affa35dp-13,
-            0x1.a01a018b4ecbbp-16, 0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
-            0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
-
-  .special_bound = SpecialBound,
-  .inv_ln2 = 0x1.71547652b82fep0,
-  .ln2_hi = 0x1.62e42fefa39efp-1,
-  .ln2_lo = 0x1.abc9e3b39803fp-56,
-  .shift = 0x1.8p52,
+  /* Table emulating FEXPA - 1, for values of FEXPA close to 1.
+  The table holds values of 2^(i/64) - 1, computed in arbitrary precision.
+  The first half of the table stores values associated to i from 0 to 15.
+  The second half stores i = 0 first, then i from -15 up to -1.  */
+  .expm1_data = {
+      0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, 0x3fa0e8a30eb37901,
+      0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb,
+      0x3fb72b83c7d517ae, 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2,
+		  0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, 0x3fc6942d3720185a,
+      0x0000000000000000, 0xbfc331751ec3a814, 0xbfc20224341286e4, 0xbfc0cf85bed0f8b7,
+      0xbfbf332113d56b1f, 0xbfbcc0768d4175a6, 0xbfba46f918837cb7, 0xbfb7c695afc3b424,
+		  0xbfb53f391822dbc7, 0xbfb2b0cfe1266bd4, 0xbfb01b466423250a, 0xbfaafd11874c009e,
+      0xbfa5b505d5b6f268, 0xbfa05e4119ea5d89, 0xbf95f134923757f3, 0xbf860f9f985bc9f4,
+    },
+
+  /* Generated using Remez, in [-log(2)/128, log(2)/128].  */
+  .c0 = 0x1p-1,
+  .c1 = 0x1.55555555548f9p-3,
+  .c2 = 0x1.5555555554c22p-5,
+  .c3 = 0x1.111123aaa2fb2p-7,
+  .c4 = 0x1.6c16d77d98e5bp-10,
+  .ln2_hi = 0x1.62e42fefa3800p-1,
+  .ln2_lo = 0x1.ef35793c76730p-45,
+  .inv_ln2 = 0x1.71547652b82fep+0,
+  .shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023.  */
+  .thres = SpecialBound,
 };
 
-static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svfloat64_t y, svbool_t pg)
+#define SpecialOffset 0x6000000000000000 /* 0x1p513.  */
+/* SpecialBias1 - SpecialBias2 = asuint(1.0).  */
+#define SpecialBias1 0x7000000000000000 /* 0x1p769.  */
+#define SpecialBias2 0x3010000000000000 /* 0x1p-254.  */
+
+static NOINLINE svfloat64_t
+special_case (svbool_t pg, svfloat64_t y, svfloat64_t s, svfloat64_t p,
+	      svfloat64_t n)
 {
-  return sv_call_f64 (expm1, x, y, pg);
+  /* s=2^n may overflow, break it up into s=s1*s2,
+     such that exp = s + s*p can be computed as s1*(s2+s2*p)
+     and s1*s1 overflows only if n>0.  */
+
+  /* If n<=0 then set b to 0x6, 0 otherwise.  */
+  svbool_t p_sign = svcmple (pg, n, 0.0); /* n <= 0.  */
+  svuint64_t b
+      = svdup_u64_z (p_sign, SpecialOffset); /* Inactive lanes set to 0.  */
+
+  /* Set s1 to generate overflow depending on sign of exponent n,
+     ie. s1 = 0x70...0 - b.  */
+  svfloat64_t s1 = svreinterpret_f64 (svsubr_x (pg, b, SpecialBias1));
+  /* Offset s to avoid overflow in final result if n is below threshold.
+     ie. s2 = as_u64 (s) - 0x3010...0 + b.  */
+  svfloat64_t s2 = svreinterpret_f64 (
+      svadd_x (pg, svsub_x (pg, svreinterpret_u64 (s), SpecialBias2), b));
+
+  /* |n| > 1280 => 2^(n) overflows.  */
+  svbool_t p_cmp = svacgt (pg, n, 1280.0);
+
+  svfloat64_t r1 = svmul_x (svptrue_b64 (), s1, s1);
+  svfloat64_t r2 = svmla_x (pg, s2, s2, p);
+  svfloat64_t r0 = svmul_x (svptrue_b64 (), r2, s1);
+
+  svbool_t is_safe = svacle (pg, n, 1023); /* Only correct special lanes.  */
+  return svsel (is_safe, y, svsub_x (pg, svsel (p_cmp, r1, r0), 1.0));
 }
 
-/* Double-precision vector exp(x) - 1 function.
-   The maximum error observed error is 2.18 ULP:
-   _ZGVsMxv_expm1(0x1.634ba0c237d7bp-2) got 0x1.a8b9ea8d66e22p-2
-				       want 0x1.a8b9ea8d66e2p-2.  */
+/* FEXPA based SVE expm1 algorithm.
+   Maximum measured error is 2.81 + 0.5 ULP:
+   _ZGVsMxv_expm1 (0x1.974060e619bfp-3) got 0x1.c290e5858bb53p-3
+				       want 0x1.c290e5858bb5p-3.  */
 svfloat64_t SV_NAME_D1 (expm1) (svfloat64_t x, svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);
 
-  /* Large, Nan/Inf.  */
-  svbool_t special = svnot_z (pg, svaclt (pg, x, d->special_bound));
-
-  /* Reduce argument to smaller range:
-     Let i = round(x / ln2)
-     and f = x - i * ln2, then f is in [-ln2/2, ln2/2].
-     exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
-     where 2^i is exact because i is an integer.  */
-  svfloat64_t shift = sv_f64 (d->shift);
-  svfloat64_t n = svsub_x (pg, svmla_x (pg, shift, x, d->inv_ln2), shift);
-  svint64_t i = svcvt_s64_x (pg, n);
-  svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
-  svfloat64_t f = svmls_lane (x, n, ln2, 0);
-  f = svmls_lane (f, n, ln2, 1);
-
-  /* Approximate expm1(f) using polynomial.
-     Taylor expansion for expm1(x) has the form:
-	 x + ax^2 + bx^3 + cx^4 ....
-     So we calculate the polynomial P(f) = a + bf + cf^2 + ...
-     and assemble the approximation expm1(f) ~= f + f^2 * P(f).  */
-  svfloat64_t f2 = svmul_x (pg, f, f);
-  svfloat64_t f4 = svmul_x (pg, f2, f2);
-  svfloat64_t f8 = svmul_x (pg, f4, f4);
-  svfloat64_t p
-      = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly));
-
-  /* Assemble the result.
-   expm1(x) ~= 2^i * (p + 1) - 1
-   Let t = 2^i.  */
-  svint64_t u = svadd_x (pg, svlsl_x (pg, i, 52), ExponentBias);
-  svfloat64_t t = svreinterpret_f64 (u);
-
-  /* expm1(x) ~= p * t + (t - 1).  */
-  svfloat64_t y = svmla_x (pg, svsub_x (pg, t, 1), p, t);
+  svbool_t special = svacgt (pg, x, d->thres);
 
-  if (__glibc_unlikely (svptest_any (pg, special)))
-    return special_case (x, y, special);
+  svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
+  svuint64_t u = svreinterpret_u64 (z);
+  svfloat64_t n = svsub_x (pg, z, d->shift);
 
+  /* r = x - n * ln2, r is in [-ln2/128, ln2/128].  */
+  svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
+  svfloat64_t r = x;
+  r = svmls_lane (r, n, ln2, 0);
+  r = svmls_lane (r, n, ln2, 1);
+
+  /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6.  */
+  svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+  svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2);
+
+  svfloat64_t p;
+  svfloat64_t c12 = svmla_lane (sv_f64 (d->c1), r, c24, 0);
+  svfloat64_t c34 = svmla_lane (sv_f64 (d->c3), r, c24, 1);
+  p = svmad_x (pg, c34, r2, c12);
+  p = svmad_x (pg, p, r, sv_f64 (d->c0));
+  p = svmad_x (pg, p, r2, r);
+
+  svfloat64_t scale = svexpa (u);
+  svfloat64_t scalem1 = svsub_x (pg, scale, sv_f64 (1.0));
+
+  /* We want to construct expm1(x) = (scale - 1) + scale * poly.
+     However, for values of scale close to 1, scale-1 causes large ULP errors
+     due to cancellation.
+
+     This can be circumvented by using a small lookup for scale-1
+     when our input is below a certain bound, otherwise we can use FEXPA.
+
+     This bound is based upon the table size:
+	   Bound = ((TableSize - 1) / 64) * ln2.
+     The current bound is based upon a table size of 16.  */
+  svbool_t is_small = svaclt (pg, x, FexpaBound);
+
+  if (svptest_any (pg, is_small))
+    {
+      /* Index via the input of FEXPA, but we only care about the lower 4 bits.
+       */
+      svuint64_t base_idx = svand_x (pg, u, 0xf);
+
+      /* We can use the sign of x as a fifth bit to account for the asymmetry
+	 of e^x around 0.  */
+      svuint64_t signBit
+	  = svlsl_x (pg, svlsr_x (pg, svreinterpret_u64 (x), 63), 4);
+      svuint64_t idx = svorr_x (pg, base_idx, signBit);
+
+      /* Lookup values for scale - 1 for small x.  */
+      svfloat64_t lookup = svreinterpret_f64 (
+	  svld1_gather_index (is_small, d->expm1_data, idx));
+
+      /* Select the appropriate scale - 1 value based on x.  */
+      scalem1 = svsel (is_small, lookup, scalem1);
+    }
+
+  svfloat64_t y = svmla_x (pg, scalem1, scale, p);
+
+  /* FEXPA returns nan for large inputs so we special case those.  */
+  if (__glibc_unlikely (svptest_any (pg, special)))
+    {
+      /* FEXPA zeroes the sign bit, however the sign is meaningful to the
+          special case function so needs to be copied.
+          e = sign bit of u << 46.  */
+      svuint64_t e = svand_x (pg, svlsl_x (pg, u, 46), 0x8000000000000000);
+      /* Copy sign to scale.  */
+      scale = svreinterpret_f64 (svadd_x (pg, e, svreinterpret_u64 (scale)));
+      return special_case (pg, y, scale, p, n);
+    }
+
+  /* return expm1 = (scale - 1) + (scale * poly).  */
   return y;
 }
diff --git a/sysdeps/aarch64/fpu/log1p_sve.c b/sysdeps/aarch64/fpu/log1p_sve.c
index 862c13f81..821c0780a 100644
--- a/sysdeps/aarch64/fpu/log1p_sve.c
+++ b/sysdeps/aarch64/fpu/log1p_sve.c
@@ -22,19 +22,33 @@
 
 static const struct data
 {
-  double poly[19];
+  float64_t c0, c2, c4, c6, c8, c10, c12, c14, c16;
+  float64_t c1, c3, c5, c7, c9, c11, c13, c15, c17, c18;
   double ln2_hi, ln2_lo;
   uint64_t hfrt2_top, onemhfrt2_top, inf, mone;
 } data = {
   /* Generated using Remez in [ sqrt(2)/2 - 1, sqrt(2) - 1]. Order 20
-     polynomial, however first 2 coefficients are 0 and 1 so are not stored.  */
-  .poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2,
-	    0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3,
-	    -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4,
-	    0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4,
-	    -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5,
-	    0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4,
-	    -0x1.cfa7385bdb37ep-6, },
+     polynomial, however first 2 coefficients are 0 and 1 so are not
+     stored.  */
+  .c0 = -0x1.ffffffffffffbp-2,
+  .c1 = 0x1.55555555551a9p-2,
+  .c2 = -0x1.00000000008e3p-2,
+  .c3 = 0x1.9999999a32797p-3,
+  .c4 = -0x1.555555552fecfp-3,
+  .c5 = 0x1.249248e071e5ap-3,
+  .c6 = -0x1.ffffff8bf8482p-4,
+  .c7 = 0x1.c71c8f07da57ap-4,
+  .c8 = -0x1.9999ca4ccb617p-4,
+  .c9 = 0x1.7459ad2e1dfa3p-4,
+  .c10 = -0x1.554d2680a3ff2p-4,
+  .c11 = 0x1.3b4c54d487455p-4,
+  .c12 = -0x1.2548a9ffe80e6p-4,
+  .c13 = 0x1.0f389a24b2e07p-4,
+  .c14 = -0x1.eee4db15db335p-5,
+  .c15 = 0x1.e95b494d4a5ddp-5,
+  .c16 = -0x1.15fdf07cb7c73p-4,
+  .c17 = 0x1.0310b70800fcfp-4,
+  .c18 = -0x1.cfa7385bdb37ep-6,
   .ln2_hi = 0x1.62e42fefa3800p-1,
   .ln2_lo = 0x1.ef35793c76730p-45,
   /* top32(asuint64(sqrt(2)/2)) << 32.  */
@@ -49,7 +63,7 @@ static const struct data
 #define BottomMask 0xffffffff
 
 static svfloat64_t NOINLINE
-special_case (svbool_t special, svfloat64_t x, svfloat64_t y)
+special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
 {
   return sv_call_f64 (log1p, x, y, special);
 }
@@ -91,8 +105,9 @@ svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg)
   /* Reduce x to f in [sqrt(2)/2, sqrt(2)].  */
   svuint64_t utop
       = svadd_x (pg, svand_x (pg, u, 0x000fffff00000000), d->hfrt2_top);
-  svuint64_t u_red = svorr_x (pg, utop, svand_x (pg, mi, BottomMask));
-  svfloat64_t f = svsub_x (pg, svreinterpret_f64 (u_red), 1);
+  svuint64_t u_red
+      = svorr_x (pg, utop, svand_x (svptrue_b64 (), mi, BottomMask));
+  svfloat64_t f = svsub_x (svptrue_b64 (), svreinterpret_f64 (u_red), 1);
 
   /* Correction term c/m.  */
   svfloat64_t cm = svdiv_x (pg, svsub_x (pg, x, svsub_x (pg, m, 1)), m);
@@ -103,18 +118,49 @@ svfloat64_t SV_NAME_D1 (log1p) (svfloat64_t x, svbool_t pg)
      Hence approximation has the form f + f^2 * P(f)
      where P(x) = C0 + C1*x + C2x^2 + ...
      Assembling this all correctly is dealt with at the final step.  */
-  svfloat64_t f2 = svmul_x (pg, f, f), f4 = svmul_x (pg, f2, f2),
-	      f8 = svmul_x (pg, f4, f4), f16 = svmul_x (pg, f8, f8);
-  svfloat64_t p = sv_estrin_18_f64_x (pg, f, f2, f4, f8, f16, d->poly);
+  svfloat64_t f2 = svmul_x (svptrue_b64 (), f, f),
+	      f4 = svmul_x (svptrue_b64 (), f2, f2),
+	      f8 = svmul_x (svptrue_b64 (), f4, f4),
+	      f16 = svmul_x (svptrue_b64 (), f8, f8);
+
+  svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
+  svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5);
+  svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9);
+  svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13);
+  svfloat64_t c1718 = svld1rq (svptrue_b64 (), &d->c17);
+
+  /* Order-18 Estrin scheme.  */
+  svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), f, c13, 0);
+  svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), f, c13, 1);
+  svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), f, c57, 0);
+  svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), f, c57, 1);
+
+  svfloat64_t p03 = svmla_x (pg, p01, f2, p23);
+  svfloat64_t p47 = svmla_x (pg, p45, f2, p67);
+  svfloat64_t p07 = svmla_x (pg, p03, f4, p47);
+
+  svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), f, c911, 0);
+  svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), f, c911, 1);
+  svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), f, c1315, 0);
+  svfloat64_t p1415 = svmla_lane (sv_f64 (d->c14), f, c1315, 1);
+
+  svfloat64_t p811 = svmla_x (pg, p89, f2, p1011);
+  svfloat64_t p1215 = svmla_x (pg, p1213, f2, p1415);
+  svfloat64_t p815 = svmla_x (pg, p811, f4, p1215);
+
+  svfloat64_t p015 = svmla_x (pg, p07, f8, p815);
+  svfloat64_t p1617 = svmla_lane (sv_f64 (d->c16), f, c1718, 0);
+  svfloat64_t p1618 = svmla_lane (p1617, f2, c1718, 1);
+  svfloat64_t p = svmla_x (pg, p015, f16, p1618);
 
   svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2_lo);
   svfloat64_t yhi = svmla_x (pg, f, k, d->ln2_hi);
-  svfloat64_t y = svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p);
 
   if (__glibc_unlikely (svptest_any (pg, special)))
-    return special_case (special, x, y);
-
-  return y;
+    return special_case (
+	x, svmla_x (svptrue_b64 (), svadd_x (svptrue_b64 (), ylo, yhi), f2, p),
+	special);
+  return svmla_x (svptrue_b64 (), svadd_x (svptrue_b64 (), ylo, yhi), f2, p);
 }
 
 strong_alias (SV_NAME_D1 (log1p), SV_NAME_D1 (logp1))
diff --git a/sysdeps/aarch64/fpu/pow_sve.c b/sysdeps/aarch64/fpu/pow_sve.c
index 42d551ca9..b8c1b39dc 100644
--- a/sysdeps/aarch64/fpu/pow_sve.c
+++ b/sysdeps/aarch64/fpu/pow_sve.c
@@ -44,19 +44,18 @@
 
 /* Data is defined in v_pow_log_data.c.  */
 #define N_LOG (1 << V_POW_LOG_TABLE_BITS)
-#define A __v_pow_log_data.poly
 #define Off 0x3fe6955500000000
 
 /* Data is defined in v_pow_exp_data.c.  */
 #define N_EXP (1 << V_POW_EXP_TABLE_BITS)
 #define SignBias (0x800 << V_POW_EXP_TABLE_BITS)
-#define C __v_pow_exp_data.poly
 #define SmallExp 0x3c9 /* top12(0x1p-54).  */
 #define BigExp 0x408   /* top12(512.).  */
 #define ThresExp 0x03f /* BigExp - SmallExp.  */
 #define HugeExp 0x409  /* top12(1024.).  */
 
 /* Constants associated with pow.  */
+#define SmallBoundX 0x1p-126
 #define SmallPowX 0x001 /* top12(0x1p-126).  */
 #define BigPowX 0x7ff	/* top12(INFINITY).  */
 #define ThresPowX 0x7fe /* BigPowX - SmallPowX.  */
@@ -64,6 +63,31 @@
 #define BigPowY 0x43e	/* top12(0x1.749p62).  */
 #define ThresPowY 0x080 /* BigPowY - SmallPowY.  */
 
+static const struct data
+{
+  double log_c0, log_c2, log_c4, log_c6, ln2_hi, ln2_lo;
+  double log_c1, log_c3, log_c5, off;
+  double n_over_ln2, exp_c2, ln2_over_n_hi, ln2_over_n_lo;
+  double exp_c0, exp_c1;
+} data = {
+  .log_c0 = -0x1p-1,
+  .log_c1 = -0x1.555555555556p-1,
+  .log_c2 = 0x1.0000000000006p-1,
+  .log_c3 = 0x1.999999959554ep-1,
+  .log_c4 = -0x1.555555529a47ap-1,
+  .log_c5 = -0x1.2495b9b4845e9p0,
+  .log_c6 = 0x1.0002b8b263fc3p0,
+  .off = Off,
+  .exp_c0 = 0x1.fffffffffffd4p-2,
+  .exp_c1 = 0x1.5555571d6ef9p-3,
+  .exp_c2 = 0x1.5555576a5adcep-5,
+  .ln2_hi = 0x1.62e42fefa3800p-1,
+  .ln2_lo = 0x1.ef35793c76730p-45,
+  .n_over_ln2 = 0x1.71547652b82fep0 * N_EXP,
+  .ln2_over_n_hi = 0x1.62e42fefc0000p-9,
+  .ln2_over_n_lo = -0x1.c610ca86c3899p-45,
+};
+
 /* Check if x is an integer.  */
 static inline svbool_t
 sv_isint (svbool_t pg, svfloat64_t x)
@@ -82,7 +106,7 @@ sv_isnotint (svbool_t pg, svfloat64_t x)
 static inline svbool_t
 sv_isodd (svbool_t pg, svfloat64_t x)
 {
-  svfloat64_t y = svmul_x (pg, x, 0.5);
+  svfloat64_t y = svmul_x (svptrue_b64 (), x, 0.5);
   return sv_isnotint (pg, y);
 }
 
@@ -121,7 +145,7 @@ zeroinfnan (uint64_t i)
 static inline svbool_t
 sv_zeroinfnan (svbool_t pg, svuint64_t i)
 {
-  return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2), 1),
+  return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1),
 		  2 * asuint64 (INFINITY) - 1);
 }
 
@@ -174,16 +198,17 @@ sv_call_specialcase (svfloat64_t x1, svuint64_t u1, svuint64_t u2,
    additional 15 bits precision.  IX is the bit representation of x, but
    normalized in the subnormal range using the sign bit for the exponent.  */
 static inline svfloat64_t
-sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail)
+sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail,
+	       const struct data *d)
 {
   /* x = 2^k z; where z is in range [Off,2*Off) and exact.
      The range is split into N subintervals.
      The ith subinterval contains z and c is near its center.  */
-  svuint64_t tmp = svsub_x (pg, ix, Off);
+  svuint64_t tmp = svsub_x (pg, ix, d->off);
   svuint64_t i = svand_x (pg, svlsr_x (pg, tmp, 52 - V_POW_LOG_TABLE_BITS),
 			  sv_u64 (N_LOG - 1));
   svint64_t k = svasr_x (pg, svreinterpret_s64 (tmp), 52);
-  svuint64_t iz = svsub_x (pg, ix, svand_x (pg, tmp, sv_u64 (0xfffULL << 52)));
+  svuint64_t iz = svsub_x (pg, ix, svlsl_x (pg, svreinterpret_u64 (k), 52));
   svfloat64_t z = svreinterpret_f64 (iz);
   svfloat64_t kd = svcvt_f64_x (pg, k);
 
@@ -199,40 +224,85 @@ sv_log_inline (svbool_t pg, svuint64_t ix, svfloat64_t *tail)
      |z/c - 1| < 1/N, so r = z/c - 1 is exactly representible.  */
   svfloat64_t r = svmad_x (pg, z, invc, -1.0);
   /* k*Ln2 + log(c) + r.  */
-  svfloat64_t t1 = svmla_x (pg, logc, kd, __v_pow_log_data.ln2_hi);
+
+  svfloat64_t ln2_hilo = svld1rq_f64 (svptrue_b64 (), &d->ln2_hi);
+  svfloat64_t t1 = svmla_lane_f64 (logc, kd, ln2_hilo, 0);
   svfloat64_t t2 = svadd_x (pg, t1, r);
-  svfloat64_t lo1 = svmla_x (pg, logctail, kd, __v_pow_log_data.ln2_lo);
+  svfloat64_t lo1 = svmla_lane_f64 (logctail, kd, ln2_hilo, 1);
   svfloat64_t lo2 = svadd_x (pg, svsub_x (pg, t1, t2), r);
 
   /* Evaluation is optimized assuming superscalar pipelined execution.  */
-  svfloat64_t ar = svmul_x (pg, r, -0.5); /* A[0] = -0.5.  */
-  svfloat64_t ar2 = svmul_x (pg, r, ar);
-  svfloat64_t ar3 = svmul_x (pg, r, ar2);
+
+  svfloat64_t log_c02 = svld1rq_f64 (svptrue_b64 (), &d->log_c0);
+  svfloat64_t ar = svmul_lane_f64 (r, log_c02, 0);
+  svfloat64_t ar2 = svmul_x (svptrue_b64 (), r, ar);
+  svfloat64_t ar3 = svmul_x (svptrue_b64 (), r, ar2);
   /* k*Ln2 + log(c) + r + A[0]*r*r.  */
   svfloat64_t hi = svadd_x (pg, t2, ar2);
-  svfloat64_t lo3 = svmla_x (pg, svneg_x (pg, ar2), ar, r);
+  svfloat64_t lo3 = svmls_x (pg, ar2, ar, r);
   svfloat64_t lo4 = svadd_x (pg, svsub_x (pg, t2, hi), ar2);
   /* p = log1p(r) - r - A[0]*r*r.  */
   /* p = (ar3 * (A[1] + r * A[2] + ar2 * (A[3] + r * A[4] + ar2 * (A[5] + r *
      A[6])))).  */
-  svfloat64_t a56 = svmla_x (pg, sv_f64 (A[5]), r, A[6]);
-  svfloat64_t a34 = svmla_x (pg, sv_f64 (A[3]), r, A[4]);
-  svfloat64_t a12 = svmla_x (pg, sv_f64 (A[1]), r, A[2]);
+
+  svfloat64_t log_c46 = svld1rq_f64 (svptrue_b64 (), &d->log_c4);
+  svfloat64_t a56 = svmla_lane_f64 (sv_f64 (d->log_c5), r, log_c46, 1);
+  svfloat64_t a34 = svmla_lane_f64 (sv_f64 (d->log_c3), r, log_c46, 0);
+  svfloat64_t a12 = svmla_lane_f64 (sv_f64 (d->log_c1), r, log_c02, 1);
   svfloat64_t p = svmla_x (pg, a34, ar2, a56);
   p = svmla_x (pg, a12, ar2, p);
-  p = svmul_x (pg, ar3, p);
+  p = svmul_x (svptrue_b64 (), ar3, p);
   svfloat64_t lo = svadd_x (
-      pg, svadd_x (pg, svadd_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p);
+      pg, svadd_x (pg, svsub_x (pg, svadd_x (pg, lo1, lo2), lo3), lo4), p);
   svfloat64_t y = svadd_x (pg, hi, lo);
   *tail = svadd_x (pg, svsub_x (pg, hi, y), lo);
   return y;
 }
 
+static inline svfloat64_t
+sv_exp_core (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
+	     svuint64_t sign_bias, svfloat64_t *tmp, svuint64_t *sbits,
+	     svuint64_t *ki, const struct data *d)
+{
+  /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)].  */
+  /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N].  */
+  svfloat64_t n_over_ln2_and_c2 = svld1rq_f64 (svptrue_b64 (), &d->n_over_ln2);
+  svfloat64_t z = svmul_lane_f64 (x, n_over_ln2_and_c2, 0);
+  /* z - kd is in [-1, 1] in non-nearest rounding modes.  */
+  svfloat64_t kd = svrinta_x (pg, z);
+  *ki = svreinterpret_u64 (svcvt_s64_x (pg, kd));
+
+  svfloat64_t ln2_over_n_hilo
+      = svld1rq_f64 (svptrue_b64 (), &d->ln2_over_n_hi);
+  svfloat64_t r = x;
+  r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 0);
+  r = svmls_lane_f64 (r, kd, ln2_over_n_hilo, 1);
+  /* The code assumes 2^-200 < |xtail| < 2^-8/N.  */
+  r = svadd_x (pg, r, xtail);
+  /* 2^(k/N) ~= scale.  */
+  svuint64_t idx = svand_x (pg, *ki, N_EXP - 1);
+  svuint64_t top
+      = svlsl_x (pg, svadd_x (pg, *ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS);
+  /* This is only a valid scale when -1023*N < k < 1024*N.  */
+  *sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx);
+  *sbits = svadd_x (pg, *sbits, top);
+  /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1).  */
+  svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+  *tmp = svmla_lane_f64 (sv_f64 (d->exp_c1), r, n_over_ln2_and_c2, 1);
+  *tmp = svmla_x (pg, sv_f64 (d->exp_c0), r, *tmp);
+  *tmp = svmla_x (pg, r, r2, *tmp);
+  svfloat64_t scale = svreinterpret_f64 (*sbits);
+  /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
+     is no spurious underflow here even without fma.  */
+  z = svmla_x (pg, scale, scale, *tmp);
+  return z;
+}
+
 /* Computes sign*exp(x+xtail) where |xtail| < 2^-8/N and |xtail| <= |x|.
    The sign_bias argument is SignBias or 0 and sets the sign to -1 or 1.  */
 static inline svfloat64_t
 sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
-	       svuint64_t sign_bias)
+	       svuint64_t sign_bias, const struct data *d)
 {
   /* 3 types of special cases: tiny (uflow and spurious uflow), huge (oflow)
      and other cases of large values of x (scale * (1 + TMP) oflow).  */
@@ -240,73 +310,46 @@ sv_exp_inline (svbool_t pg, svfloat64_t x, svfloat64_t xtail,
   /* |x| is large (|x| >= 512) or tiny (|x| <= 0x1p-54).  */
   svbool_t uoflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), ThresExp);
 
-  /* Conditions special, uflow and oflow are all expressed as uoflow &&
-     something, hence do not bother computing anything if no lane in uoflow is
-     true.  */
-  svbool_t special = svpfalse_b ();
-  svbool_t uflow = svpfalse_b ();
-  svbool_t oflow = svpfalse_b ();
+  svfloat64_t tmp;
+  svuint64_t sbits, ki;
   if (__glibc_unlikely (svptest_any (pg, uoflow)))
     {
+      svfloat64_t z
+	  = sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d);
+
       /* |x| is tiny (|x| <= 0x1p-54).  */
-      uflow = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000);
+      svbool_t uflow
+	  = svcmpge (pg, svsub_x (pg, abstop, SmallExp), 0x80000000);
       uflow = svand_z (pg, uoflow, uflow);
       /* |x| is huge (|x| >= 1024).  */
-      oflow = svcmpge (pg, abstop, HugeExp);
+      svbool_t oflow = svcmpge (pg, abstop, HugeExp);
       oflow = svand_z (pg, uoflow, svbic_z (pg, oflow, uflow));
+
       /* For large |x| values (512 < |x| < 1024) scale * (1 + TMP) can overflow
-	 or underflow.  */
-      special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow));
+    or underflow.  */
+      svbool_t special = svbic_z (pg, uoflow, svorr_z (pg, uflow, oflow));
+
+      /* Update result with special and large cases.  */
+      z = sv_call_specialcase (tmp, sbits, ki, z, special);
+
+      /* Handle underflow and overflow.  */
+      svbool_t x_is_neg = svcmplt (pg, x, 0);
+      svuint64_t sign_mask
+	  = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS);
+      svfloat64_t res_uoflow
+	  = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY));
+      res_uoflow = svreinterpret_f64 (
+	  svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask));
+      /* Avoid spurious underflow for tiny x.  */
+      svfloat64_t res_spurious_uflow
+	  = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000));
+
+      z = svsel (oflow, res_uoflow, z);
+      z = svsel (uflow, res_spurious_uflow, z);
+      return z;
     }
 
-  /* exp(x) = 2^(k/N) * exp(r), with exp(r) in [2^(-1/2N),2^(1/2N)].  */
-  /* x = ln2/N*k + r, with int k and r in [-ln2/2N, ln2/2N].  */
-  svfloat64_t z = svmul_x (pg, x, __v_pow_exp_data.n_over_ln2);
-  /* z - kd is in [-1, 1] in non-nearest rounding modes.  */
-  svfloat64_t shift = sv_f64 (__v_pow_exp_data.shift);
-  svfloat64_t kd = svadd_x (pg, z, shift);
-  svuint64_t ki = svreinterpret_u64 (kd);
-  kd = svsub_x (pg, kd, shift);
-  svfloat64_t r = x;
-  r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_hi);
-  r = svmls_x (pg, r, kd, __v_pow_exp_data.ln2_over_n_lo);
-  /* The code assumes 2^-200 < |xtail| < 2^-8/N.  */
-  r = svadd_x (pg, r, xtail);
-  /* 2^(k/N) ~= scale.  */
-  svuint64_t idx = svand_x (pg, ki, N_EXP - 1);
-  svuint64_t top
-      = svlsl_x (pg, svadd_x (pg, ki, sign_bias), 52 - V_POW_EXP_TABLE_BITS);
-  /* This is only a valid scale when -1023*N < k < 1024*N.  */
-  svuint64_t sbits = svld1_gather_index (pg, __v_pow_exp_data.sbits, idx);
-  sbits = svadd_x (pg, sbits, top);
-  /* exp(x) = 2^(k/N) * exp(r) ~= scale + scale * (exp(r) - 1).  */
-  svfloat64_t r2 = svmul_x (pg, r, r);
-  svfloat64_t tmp = svmla_x (pg, sv_f64 (C[1]), r, C[2]);
-  tmp = svmla_x (pg, sv_f64 (C[0]), r, tmp);
-  tmp = svmla_x (pg, r, r2, tmp);
-  svfloat64_t scale = svreinterpret_f64 (sbits);
-  /* Note: tmp == 0 or |tmp| > 2^-200 and scale > 2^-739, so there
-     is no spurious underflow here even without fma.  */
-  z = svmla_x (pg, scale, scale, tmp);
-
-  /* Update result with special and large cases.  */
-  if (__glibc_unlikely (svptest_any (pg, special)))
-    z = sv_call_specialcase (tmp, sbits, ki, z, special);
-
-  /* Handle underflow and overflow.  */
-  svuint64_t sign_bit = svlsr_x (pg, svreinterpret_u64 (x), 63);
-  svbool_t x_is_neg = svcmpne (pg, sign_bit, 0);
-  svuint64_t sign_mask = svlsl_x (pg, sign_bias, 52 - V_POW_EXP_TABLE_BITS);
-  svfloat64_t res_uoflow = svsel (x_is_neg, sv_f64 (0.0), sv_f64 (INFINITY));
-  res_uoflow = svreinterpret_f64 (
-      svorr_x (pg, svreinterpret_u64 (res_uoflow), sign_mask));
-  z = svsel (oflow, res_uoflow, z);
-  /* Avoid spurious underflow for tiny x.  */
-  svfloat64_t res_spurious_uflow
-      = svreinterpret_f64 (svorr_x (pg, sign_mask, 0x3ff0000000000000));
-  z = svsel (uflow, res_spurious_uflow, z);
-
-  return z;
+  return sv_exp_core (pg, x, xtail, sign_bias, &tmp, &sbits, &ki, d);
 }
 
 static inline double
@@ -341,47 +384,39 @@ pow_sc (double x, double y)
 
 svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)
 {
+  const struct data *d = ptr_barrier (&data);
+
   /* This preamble handles special case conditions used in the final scalar
      fallbacks. It also updates ix and sign_bias, that are used in the core
      computation too, i.e., exp( y * log (x) ).  */
   svuint64_t vix0 = svreinterpret_u64 (x);
   svuint64_t viy0 = svreinterpret_u64 (y);
-  svuint64_t vtopx0 = svlsr_x (svptrue_b64 (), vix0, 52);
 
   /* Negative x cases.  */
-  svuint64_t sign_bit = svlsr_m (pg, vix0, 63);
-  svbool_t xisneg = svcmpeq (pg, sign_bit, 1);
+  svbool_t xisneg = svcmplt (pg, x, 0);
 
   /* Set sign_bias and ix depending on sign of x and nature of y.  */
-  svbool_t yisnotint_xisneg = svpfalse_b ();
+  svbool_t yint_or_xpos = pg;
   svuint64_t sign_bias = sv_u64 (0);
   svuint64_t vix = vix0;
-  svuint64_t vtopx1 = vtopx0;
   if (__glibc_unlikely (svptest_any (pg, xisneg)))
     {
       /* Determine nature of y.  */
-      yisnotint_xisneg = sv_isnotint (xisneg, y);
-      svbool_t yisint_xisneg = sv_isint (xisneg, y);
+      yint_or_xpos = sv_isint (xisneg, y);
       svbool_t yisodd_xisneg = sv_isodd (xisneg, y);
       /* ix set to abs(ix) if y is integer.  */
-      vix = svand_m (yisint_xisneg, vix0, 0x7fffffffffffffff);
-      vtopx1 = svand_m (yisint_xisneg, vtopx0, 0x7ff);
+      vix = svand_m (yint_or_xpos, vix0, 0x7fffffffffffffff);
       /* Set to SignBias if x is negative and y is odd.  */
       sign_bias = svsel (yisodd_xisneg, sv_u64 (SignBias), sv_u64 (0));
     }
 
-  /* Special cases of x or y: zero, inf and nan.  */
-  svbool_t xspecial = sv_zeroinfnan (pg, vix0);
-  svbool_t yspecial = sv_zeroinfnan (pg, viy0);
-  svbool_t special = svorr_z (pg, xspecial, yspecial);
-
   /* Small cases of x: |x| < 0x1p-126.  */
-  svuint64_t vabstopx0 = svand_x (pg, vtopx0, 0x7ff);
-  svbool_t xsmall = svcmplt (pg, vabstopx0, SmallPowX);
-  if (__glibc_unlikely (svptest_any (pg, xsmall)))
+  svbool_t xsmall = svaclt (yint_or_xpos, x, SmallBoundX);
+  if (__glibc_unlikely (svptest_any (yint_or_xpos, xsmall)))
     {
       /* Normalize subnormal x so exponent becomes negative.  */
-      svbool_t topx_is_null = svcmpeq (xsmall, vtopx1, 0);
+      svuint64_t vtopx = svlsr_x (svptrue_b64 (), vix, 52);
+      svbool_t topx_is_null = svcmpeq (xsmall, vtopx, 0);
 
       svuint64_t vix_norm = svreinterpret_u64 (svmul_m (xsmall, x, 0x1p52));
       vix_norm = svand_m (xsmall, vix_norm, 0x7fffffffffffffff);
@@ -391,20 +426,24 @@ svfloat64_t SV_NAME_D2 (pow) (svfloat64_t x, svfloat64_t y, const svbool_t pg)
 
   /* y_hi = log(ix, &y_lo).  */
   svfloat64_t vlo;
-  svfloat64_t vhi = sv_log_inline (pg, vix, &vlo);
+  svfloat64_t vhi = sv_log_inline (yint_or_xpos, vix, &vlo, d);
 
   /* z = exp(y_hi, y_lo, sign_bias).  */
-  svfloat64_t vehi = svmul_x (pg, y, vhi);
-  svfloat64_t velo = svmul_x (pg, y, vlo);
-  svfloat64_t vemi = svmls_x (pg, vehi, y, vhi);
-  velo = svsub_x (pg, velo, vemi);
-  svfloat64_t vz = sv_exp_inline (pg, vehi, velo, sign_bias);
+  svfloat64_t vehi = svmul_x (svptrue_b64 (), y, vhi);
+  svfloat64_t vemi = svmls_x (yint_or_xpos, vehi, y, vhi);
+  svfloat64_t velo = svnmls_x (yint_or_xpos, vemi, y, vlo);
+  svfloat64_t vz = sv_exp_inline (yint_or_xpos, vehi, velo, sign_bias, d);
 
   /* Cases of finite y and finite negative x.  */
-  vz = svsel (yisnotint_xisneg, sv_f64 (__builtin_nan ("")), vz);
+  vz = svsel (yint_or_xpos, vz, sv_f64 (__builtin_nan ("")));
+
+  /* Special cases of x or y: zero, inf and nan.  */
+  svbool_t xspecial = sv_zeroinfnan (svptrue_b64 (), vix0);
+  svbool_t yspecial = sv_zeroinfnan (svptrue_b64 (), viy0);
+  svbool_t special = svorr_z (svptrue_b64 (), xspecial, yspecial);
 
   /* Cases of zero/inf/nan x or y.  */
-  if (__glibc_unlikely (svptest_any (pg, special)))
+  if (__glibc_unlikely (svptest_any (svptrue_b64 (), special)))
     vz = sv_call2_f64 (pow_sc, x, y, vz, special);
 
   return vz;
diff --git a/sysdeps/aarch64/fpu/powf_sve.c b/sysdeps/aarch64/fpu/powf_sve.c
index 29e9acb6f..7046990aa 100644
--- a/sysdeps/aarch64/fpu/powf_sve.c
+++ b/sysdeps/aarch64/fpu/powf_sve.c
@@ -26,7 +26,6 @@
 #define Tlogc __v_powf_data.logc
 #define Texp __v_powf_data.scale
 #define SignBias (1 << (V_POWF_EXP2_TABLE_BITS + 11))
-#define Shift 0x1.8p52
 #define Norm 0x1p23f /* 0x4b000000.  */
 
 /* Overall ULP error bound for pow is 2.6 ulp
@@ -36,7 +35,7 @@ static const struct data
   double log_poly[4];
   double exp_poly[3];
   float uflow_bound, oflow_bound, small_bound;
-  uint32_t sign_bias, sign_mask, subnormal_bias, off;
+  uint32_t sign_bias, subnormal_bias, off;
 } data = {
   /* rel err: 1.5 * 2^-30. Each coefficients is multiplied the value of
      V_POWF_EXP2_N.  */
@@ -53,7 +52,6 @@ static const struct data
   .small_bound = 0x1p-126f,
   .off = 0x3f35d000,
   .sign_bias = SignBias,
-  .sign_mask = 0x80000000,
   .subnormal_bias = 0x0b800000, /* 23 << 23.  */
 };
 
@@ -86,7 +84,7 @@ svisodd (svbool_t pg, svfloat32_t x)
 static inline svbool_t
 sv_zeroinfnan (svbool_t pg, svuint32_t i)
 {
-  return svcmpge (pg, svsub_x (pg, svmul_x (pg, i, 2u), 1),
+  return svcmpge (pg, svsub_x (pg, svadd_x (pg, i, i), 1),
 		  2u * 0x7f800000 - 1);
 }
 
@@ -150,9 +148,14 @@ powf_specialcase (float x, float y, float z)
 }
 
 /* Scalar fallback for special case routines with custom signature.  */
-static inline svfloat32_t
-sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y, svbool_t cmp)
+static svfloat32_t NOINLINE
+sv_call_powf_sc (svfloat32_t x1, svfloat32_t x2, svfloat32_t y)
 {
+  /* Special cases of x or y: zero, inf and nan.  */
+  svbool_t xspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x1));
+  svbool_t yspecial = sv_zeroinfnan (svptrue_b32 (), svreinterpret_u32 (x2));
+  svbool_t cmp = svorr_z (svptrue_b32 (), xspecial, yspecial);
+
   svbool_t p = svpfirst (cmp, svpfalse ());
   while (svptest_any (cmp, p))
     {
@@ -182,30 +185,30 @@ sv_powf_core_ext (const svbool_t pg, svuint64_t i, svfloat64_t z, svint64_t k,
 
   /* Polynomial to approximate log1p(r)/ln2.  */
   svfloat64_t logx = A (0);
-  logx = svmla_x (pg, A (1), r, logx);
-  logx = svmla_x (pg, A (2), r, logx);
-  logx = svmla_x (pg, A (3), r, logx);
-  logx = svmla_x (pg, y0, r, logx);
+  logx = svmad_x (pg, r, logx, A (1));
+  logx = svmad_x (pg, r, logx, A (2));
+  logx = svmad_x (pg, r, logx, A (3));
+  logx = svmad_x (pg, r, logx, y0);
   *pylogx = svmul_x (pg, y, logx);
 
   /* z - kd is in [-1, 1] in non-nearest rounding modes.  */
-  svfloat64_t kd = svadd_x (pg, *pylogx, Shift);
-  svuint64_t ki = svreinterpret_u64 (kd);
-  kd = svsub_x (pg, kd, Shift);
+  svfloat64_t kd = svrinta_x (svptrue_b64 (), *pylogx);
+  svuint64_t ki = svreinterpret_u64 (svcvt_s64_x (svptrue_b64 (), kd));
 
   r = svsub_x (pg, *pylogx, kd);
 
   /* exp2(x) = 2^(k/N) * 2^r ~= s * (C0*r^3 + C1*r^2 + C2*r + 1).  */
-  svuint64_t t
-      = svld1_gather_index (pg, Texp, svand_x (pg, ki, V_POWF_EXP2_N - 1));
-  svuint64_t ski = svadd_x (pg, ki, sign_bias);
-  t = svadd_x (pg, t, svlsl_x (pg, ski, 52 - V_POWF_EXP2_TABLE_BITS));
+  svuint64_t t = svld1_gather_index (
+      svptrue_b64 (), Texp, svand_x (svptrue_b64 (), ki, V_POWF_EXP2_N - 1));
+  svuint64_t ski = svadd_x (svptrue_b64 (), ki, sign_bias);
+  t = svadd_x (svptrue_b64 (), t,
+	       svlsl_x (svptrue_b64 (), ski, 52 - V_POWF_EXP2_TABLE_BITS));
   svfloat64_t s = svreinterpret_f64 (t);
 
   svfloat64_t p = C (0);
   p = svmla_x (pg, C (1), p, r);
   p = svmla_x (pg, C (2), p, r);
-  p = svmla_x (pg, s, p, svmul_x (pg, s, r));
+  p = svmla_x (pg, s, p, svmul_x (svptrue_b64 (), s, r));
 
   return p;
 }
@@ -219,19 +222,16 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
 {
   const svbool_t ptrue = svptrue_b64 ();
 
-  /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two in
-     order to perform core computation in double precision.  */
+  /* Unpack and promote input vectors (pg, y, z, i, k and sign_bias) into two
+   * in order to perform core computation in double precision.  */
   const svbool_t pg_lo = svunpklo (pg);
   const svbool_t pg_hi = svunpkhi (pg);
-  svfloat64_t y_lo = svcvt_f64_x (
-      ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
-  svfloat64_t y_hi = svcvt_f64_x (
-      ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
-  svfloat32_t z = svreinterpret_f32 (iz);
-  svfloat64_t z_lo = svcvt_f64_x (
-      ptrue, svreinterpret_f32 (svunpklo (svreinterpret_u32 (z))));
-  svfloat64_t z_hi = svcvt_f64_x (
-      ptrue, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (z))));
+  svfloat64_t y_lo
+      = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (svreinterpret_u32 (y))));
+  svfloat64_t y_hi
+      = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (svreinterpret_u32 (y))));
+  svfloat64_t z_lo = svcvt_f64_x (pg, svreinterpret_f32 (svunpklo (iz)));
+  svfloat64_t z_hi = svcvt_f64_x (pg, svreinterpret_f32 (svunpkhi (iz)));
   svuint64_t i_lo = svunpklo (i);
   svuint64_t i_hi = svunpkhi (i);
   svint64_t k_lo = svunpklo (k);
@@ -258,9 +258,9 @@ sv_powf_core (const svbool_t pg, svuint32_t i, svuint32_t iz, svint32_t k,
 /* Implementation of SVE powf.
    Provides the same accuracy as AdvSIMD powf, since it relies on the same
    algorithm. The theoretical maximum error is under 2.60 ULPs.
-   Maximum measured error is 2.56 ULPs:
-   SV_NAME_F2 (pow) (0x1.004118p+0, 0x1.5d14a4p+16) got 0x1.fd4bp+127
-						   want 0x1.fd4b06p+127.  */
+   Maximum measured error is 2.57 ULPs:
+   SV_NAME_F2 (pow) (0x1.031706p+0, 0x1.ce2ec2p+12) got 0x1.fff868p+127
+						   want 0x1.fff862p+127.  */
 svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);
@@ -269,21 +269,19 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
   svuint32_t viy0 = svreinterpret_u32 (y);
 
   /* Negative x cases.  */
-  svuint32_t sign_bit = svand_m (pg, vix0, d->sign_mask);
-  svbool_t xisneg = svcmpeq (pg, sign_bit, d->sign_mask);
+  svbool_t xisneg = svcmplt (pg, x, sv_f32 (0));
 
   /* Set sign_bias and ix depending on sign of x and nature of y.  */
-  svbool_t yisnotint_xisneg = svpfalse_b ();
+  svbool_t yint_or_xpos = pg;
   svuint32_t sign_bias = sv_u32 (0);
   svuint32_t vix = vix0;
   if (__glibc_unlikely (svptest_any (pg, xisneg)))
     {
       /* Determine nature of y.  */
-      yisnotint_xisneg = svisnotint (xisneg, y);
-      svbool_t yisint_xisneg = svisint (xisneg, y);
+      yint_or_xpos = svisint (xisneg, y);
       svbool_t yisodd_xisneg = svisodd (xisneg, y);
       /* ix set to abs(ix) if y is integer.  */
-      vix = svand_m (yisint_xisneg, vix0, 0x7fffffff);
+      vix = svand_m (yint_or_xpos, vix0, 0x7fffffff);
       /* Set to SignBias if x is negative and y is odd.  */
       sign_bias = svsel (yisodd_xisneg, sv_u32 (d->sign_bias), sv_u32 (0));
     }
@@ -294,8 +292,8 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
   svbool_t cmp = svorr_z (pg, xspecial, yspecial);
 
   /* Small cases of x: |x| < 0x1p-126.  */
-  svbool_t xsmall = svaclt (pg, x, d->small_bound);
-  if (__glibc_unlikely (svptest_any (pg, xsmall)))
+  svbool_t xsmall = svaclt (yint_or_xpos, x, d->small_bound);
+  if (__glibc_unlikely (svptest_any (yint_or_xpos, xsmall)))
     {
       /* Normalize subnormal x so exponent becomes negative.  */
       svuint32_t vix_norm = svreinterpret_u32 (svmul_x (xsmall, x, Norm));
@@ -304,32 +302,35 @@ svfloat32_t SV_NAME_F2 (pow) (svfloat32_t x, svfloat32_t y, const svbool_t pg)
       vix = svsel (xsmall, vix_norm, vix);
     }
   /* Part of core computation carried in working precision.  */
-  svuint32_t tmp = svsub_x (pg, vix, d->off);
-  svuint32_t i = svand_x (pg, svlsr_x (pg, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
-			  V_POWF_LOG2_N - 1);
-  svuint32_t top = svand_x (pg, tmp, 0xff800000);
-  svuint32_t iz = svsub_x (pg, vix, top);
-  svint32_t k
-      = svasr_x (pg, svreinterpret_s32 (top), (23 - V_POWF_EXP2_TABLE_BITS));
-
-  /* Compute core in extended precision and return intermediate ylogx results to
-      handle cases of underflow and underflow in exp.  */
+  svuint32_t tmp = svsub_x (yint_or_xpos, vix, d->off);
+  svuint32_t i = svand_x (
+      yint_or_xpos, svlsr_x (yint_or_xpos, tmp, (23 - V_POWF_LOG2_TABLE_BITS)),
+      V_POWF_LOG2_N - 1);
+  svuint32_t top = svand_x (yint_or_xpos, tmp, 0xff800000);
+  svuint32_t iz = svsub_x (yint_or_xpos, vix, top);
+  svint32_t k = svasr_x (yint_or_xpos, svreinterpret_s32 (top),
+			 (23 - V_POWF_EXP2_TABLE_BITS));
+
+  /* Compute core in extended precision and return intermediate ylogx results
+   * to handle cases of underflow and overflow in exp.  */
   svfloat32_t ylogx;
-  svfloat32_t ret = sv_powf_core (pg, i, iz, k, y, sign_bias, &ylogx, d);
+  svfloat32_t ret
+      = sv_powf_core (yint_or_xpos, i, iz, k, y, sign_bias, &ylogx, d);
 
   /* Handle exp special cases of underflow and overflow.  */
-  svuint32_t sign = svlsl_x (pg, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
+  svuint32_t sign
+      = svlsl_x (yint_or_xpos, sign_bias, 20 - V_POWF_EXP2_TABLE_BITS);
   svfloat32_t ret_oflow
-      = svreinterpret_f32 (svorr_x (pg, sign, asuint (INFINITY)));
+      = svreinterpret_f32 (svorr_x (yint_or_xpos, sign, asuint (INFINITY)));
   svfloat32_t ret_uflow = svreinterpret_f32 (sign);
-  ret = svsel (svcmple (pg, ylogx, d->uflow_bound), ret_uflow, ret);
-  ret = svsel (svcmpgt (pg, ylogx, d->oflow_bound), ret_oflow, ret);
+  ret = svsel (svcmple (yint_or_xpos, ylogx, d->uflow_bound), ret_uflow, ret);
+  ret = svsel (svcmpgt (yint_or_xpos, ylogx, d->oflow_bound), ret_oflow, ret);
 
   /* Cases of finite y and finite negative x.  */
-  ret = svsel (yisnotint_xisneg, sv_f32 (__builtin_nanf ("")), ret);
+  ret = svsel (yint_or_xpos, ret, sv_f32 (__builtin_nanf ("")));
 
-  if (__glibc_unlikely (svptest_any (pg, cmp)))
-    return sv_call_powf_sc (x, y, ret, cmp);
+  if (__glibc_unlikely (svptest_any (cmp, cmp)))
+    return sv_call_powf_sc (x, y, ret);
 
   return ret;
 }
diff --git a/sysdeps/aarch64/fpu/sinh_sve.c b/sysdeps/aarch64/fpu/sinh_sve.c
index 963453f81..072ba8fca 100644
--- a/sysdeps/aarch64/fpu/sinh_sve.c
+++ b/sysdeps/aarch64/fpu/sinh_sve.c
@@ -18,90 +18,153 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include "sv_math.h"
-#include "poly_sve_f64.h"
 
 static const struct data
 {
-  float64_t poly[11];
-  float64_t inv_ln2, m_ln2_hi, m_ln2_lo, shift;
   uint64_t halff;
-  int64_t onef;
-  uint64_t large_bound;
+  double c2, c4;
+  double inv_ln2;
+  double ln2_hi, ln2_lo;
+  double c0, c1, c3;
+  double shift, special_bound, bound;
+  uint64_t expm1_data[20];
 } data = {
-  /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2].  */
-  .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
-	    0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10,
-	    0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16,
-	    0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
-	    0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
-
-  .inv_ln2 = 0x1.71547652b82fep0,
-  .m_ln2_hi = -0x1.62e42fefa39efp-1,
-  .m_ln2_lo = -0x1.abc9e3b39803fp-56,
-  .shift = 0x1.8p52,
-
+  /* Table lookup of 2^(i/64) - 1, for values of i from 0..19.  */
+  .expm1_data = {
+    0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, 0x3fa0e8a30eb37901,
+    0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb,
+    0x3fb72b83c7d517ae, 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2,
+    0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, 0x3fc6942d3720185a,
+    0x3fc837f0518db8a9, 0x3fc9e0459320b7fa, 0x3fcb8d39b9d54e55, 0x3fcd3ed9a72cffb7,
+  },
+
+  /* Generated using Remez, in [-log(2)/128, log(2)/128].  */
+  .c0 = 0x1p-1,
+  .c1 = 0x1.55555555548f9p-3,
+  .c2 = 0x1.5555555554c22p-5,
+  .c3 = 0x1.111123aaa2fb2p-7,
+  .c4 = 0x1.6c16d77d98e5bp-10,
+  .ln2_hi = 0x1.62e42fefa3800p-1,
+  .ln2_lo = 0x1.ef35793c76730p-45,
+  .inv_ln2 = 0x1.71547652b82fep+0,
+  .shift = 0x1.800000000ffc0p+46, /* 1.5*2^46+1023.  */
   .halff = 0x3fe0000000000000,
-  .onef = 0x3ff0000000000000,
-  /* 2^9. expm1 helper overflows for large input.  */
-  .large_bound = 0x4080000000000000,
+  .special_bound = 0x1.62e37e7d8ba72p+9,	/* ln(2^(1024 - 1/128)).  */
+  .bound = 0x1.a56ef8ec924ccp-3 /* 19*ln2/64.  */
 };
 
+/* A specialised FEXPA expm1 that is only valid for positive inputs and
+   has no special cases. Based off the full FEXPA expm1 implemented for
+   _ZGVsMxv_expm1, with a slightly modified file to keep sinh under 3.5ULP.  */
 static inline svfloat64_t
-expm1_inline (svfloat64_t x, svbool_t pg)
+expm1_inline (svbool_t pg, svfloat64_t x)
 {
   const struct data *d = ptr_barrier (&data);
 
-  /* Reduce argument:
-     exp(x) - 1 = 2^i * (expm1(f) + 1) - 1
-     where i = round(x / ln2)
-     and   f = x - i * ln2 (f in [-ln2/2, ln2/2]).  */
-  svfloat64_t j
-      = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift);
-  svint64_t i = svcvt_s64_x (pg, j);
-  svfloat64_t f = svmla_x (pg, x, j, d->m_ln2_hi);
-  f = svmla_x (pg, f, j, d->m_ln2_lo);
-  /* Approximate expm1(f) using polynomial.  */
-  svfloat64_t f2 = svmul_x (pg, f, f);
-  svfloat64_t f4 = svmul_x (pg, f2, f2);
-  svfloat64_t f8 = svmul_x (pg, f4, f4);
-  svfloat64_t p
-      = svmla_x (pg, f, f2, sv_estrin_10_f64_x (pg, f, f2, f4, f8, d->poly));
-  /* t = 2^i.  */
-  svfloat64_t t = svscale_x (pg, sv_f64 (1), i);
-  /* expm1(x) ~= p * t + (t - 1).  */
-  return svmla_x (pg, svsub_x (pg, t, 1.0), p, t);
+  svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2);
+  svuint64_t u = svreinterpret_u64 (z);
+  svfloat64_t n = svsub_x (pg, z, d->shift);
+
+  svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
+  svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2);
+
+  svfloat64_t r = x;
+  r = svmls_lane (r, n, ln2, 0);
+  r = svmls_lane (r, n, ln2, 1);
+
+  svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+
+  svfloat64_t p;
+  svfloat64_t c12 = svmla_lane (sv_f64 (d->c1), r, c24, 0);
+  svfloat64_t c34 = svmla_lane (sv_f64 (d->c3), r, c24, 1);
+  p = svmad_x (pg, c34, r2, c12);
+  p = svmad_x (pg, p, r, sv_f64 (d->c0));
+  p = svmad_x (pg, p, r2, r);
+
+  svfloat64_t scale = svexpa (u);
+
+  /* We want to construct expm1(x) = (scale - 1) + scale * poly.
+     However, for values of scale close to 1, scale-1 causes large ULP errors
+     due to cancellation.
+
+     This can be circumvented by using a small lookup for scale-1
+     when our input is below a certain bound, otherwise we can use FEXPA.  */
+  svbool_t is_small = svaclt (pg, x, d->bound);
+
+  /* Index via the input of FEXPA, but we only care about the lower 5 bits.  */
+  svuint64_t base_idx = svand_x (pg, u, 0x1f);
+
+  /* Compute scale - 1 from FEXPA, and lookup values where this fails.  */
+  svfloat64_t scalem1_estimate = svsub_x (pg, scale, sv_f64 (1.0));
+  svuint64_t scalem1_lookup
+      = svld1_gather_index (is_small, d->expm1_data, base_idx);
+
+  /* Select the appropriate scale - 1 value based on x.  */
+  svfloat64_t scalem1
+      = svsel (is_small, svreinterpret_f64 (scalem1_lookup), scalem1_estimate);
+
+  /* return expm1 = scale - 1 + (scale * poly).  */
+  return svmla_x (pg, scalem1, scale, p);
 }
 
+/* Vectorised special case to handle values past where expm1_inline overflows.
+   Halves the input value and uses the identity exp(x) = exp(x/2)^2 to double
+   the valid range of inputs, and returns inf for anything past that.  */
 static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svbool_t pg)
+special_case (svbool_t pg, svbool_t special, svfloat64_t ax,
+	      svfloat64_t halfsign, const struct data *d)
 {
-  return sv_call_f64 (sinh, x, x, pg);
+  /* Halve the input value, and then check if any cases
+     are still going to overflow.  */
+  ax = svmul_x (special, ax, 0.5);
+  svbool_t is_safe = svaclt (special, ax, d->special_bound);
+
+  svfloat64_t t = expm1_inline (pg, ax);
+
+  /* Finish fast path to compute values for non-special cases.  */
+  svfloat64_t y = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
+  y = svmul_x (pg, y, halfsign);
+
+  /* Compute special lanes, and set remaining overflow lanes to inf.  */
+  svfloat64_t half_special_y = svmul_x (svptrue_b64 (), t, halfsign);
+  svfloat64_t special_y = svmul_x (svptrue_b64 (), half_special_y, t);
+
+  svuint64_t signed_inf
+      = svorr_x (svptrue_b64 (), svreinterpret_u64 (halfsign),
+		 sv_u64 (0x7ff0000000000000));
+  special_y = svsel (is_safe, special_y, svreinterpret_f64 (signed_inf));
+
+  /* Join resulting vectors together and return.  */
+  return svsel (special, special_y, y);
 }
 
-/* Approximation for SVE double-precision sinh(x) using expm1.
-   sinh(x) = (exp(x) - exp(-x)) / 2.
-   The greatest observed error is 2.57 ULP:
-   _ZGVsMxv_sinh (0x1.a008538399931p-2) got 0x1.ab929fc64bd66p-2
-				       want 0x1.ab929fc64bd63p-2.  */
+/* Approximation for SVE double-precision sinh(x) using FEXPA expm1.
+   Uses sinh(x) = (e^(2x) - 1) / (2e^x), rewritten for accuracy.
+   The greatest observed error in the non-special region is 2.63 + 0.5 ULP:
+   _ZGVsMxv_sinh (0x1.b5e0e13ba88aep-2) got 0x1.c3587faf97b0cp-2
+				       want 0x1.c3587faf97b09p-2
+
+   The greatest observed error in the special region is 2.65 + 0.5 ULP:
+   _ZGVsMxv_sinh (0x1.633ce847dab1ap+9) got 0x1.fffd30eea0066p+1023
+				       want 0x1.fffd30eea0063p+1023.  */
 svfloat64_t SV_NAME_D1 (sinh) (svfloat64_t x, svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);
 
+  svbool_t special = svacge (pg, x, d->special_bound);
   svfloat64_t ax = svabs_x (pg, x);
   svuint64_t sign
       = sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax));
   svfloat64_t halfsign = svreinterpret_f64 (svorr_x (pg, sign, d->halff));
 
-  svbool_t special = svcmpge (pg, svreinterpret_u64 (ax), d->large_bound);
-
   /* Fall back to scalar variant for all lanes if any are special.  */
   if (__glibc_unlikely (svptest_any (pg, special)))
-    return special_case (x, pg);
+    return special_case (pg, special, ax, halfsign, d);
 
   /* Up to the point that expm1 overflows, we can use it to calculate sinh
      using a slight rearrangement of the definition of sinh. This allows us to
      retain acceptable accuracy for very small inputs.  */
-  svfloat64_t t = expm1_inline (ax, pg);
+  svfloat64_t t = expm1_inline (pg, ax);
   t = svadd_x (pg, t, svdiv_x (pg, t, svadd_x (pg, t, 1.0)));
   return svmul_x (pg, t, halfsign);
 }
diff --git a/sysdeps/aarch64/fpu/sv_expf_inline.h b/sysdeps/aarch64/fpu/sv_expf_inline.h
index f208d3389..e2d2e906b 100644
--- a/sysdeps/aarch64/fpu/sv_expf_inline.h
+++ b/sysdeps/aarch64/fpu/sv_expf_inline.h
@@ -24,52 +24,41 @@
 
 struct sv_expf_data
 {
-  float c1, c3, inv_ln2;
-  float ln2_lo, c0, c2, c4;
-  float ln2_hi, shift;
+  float ln2_hi, ln2_lo, c1, null;
+  float inv_ln2, shift;
 };
 
-/* Coefficients copied from the polynomial in AdvSIMD variant, reversed for
-   compatibility with polynomial helpers. Shift is 1.5*2^17 + 127.  */
+/* Shift is 1.5*2^17 + 127.  */
 #define SV_EXPF_DATA                                                          \
   {                                                                           \
-    /* Coefficients copied from the polynomial in AdvSIMD variant.  */        \
-    .c0 = 0x1.ffffecp-1f, .c1 = 0x1.fffdb6p-2f, .c2 = 0x1.555e66p-3f,         \
-    .c3 = 0x1.573e2ep-5f, .c4 = 0x1.0e4020p-7f, .inv_ln2 = 0x1.715476p+0f,    \
-    .ln2_hi = 0x1.62e4p-1f, .ln2_lo = 0x1.7f7d1cp-20f,                        \
-    .shift = 0x1.803f8p17f,                                                   \
+    .c1 = 0.5f, .inv_ln2 = 0x1.715476p+0f, .ln2_hi = 0x1.62e4p-1f,            \
+    .ln2_lo = 0x1.7f7d1cp-20f, .shift = 0x1.803f8p17f,                        \
   }
 
-#define C(i) sv_f32 (d->poly[i])
-
 static inline svfloat32_t
 expf_inline (svfloat32_t x, const svbool_t pg, const struct sv_expf_data *d)
 {
   /* exp(x) = 2^n (1 + poly(r)), with 1 + poly(r) in [1/sqrt(2),sqrt(2)]
      x = ln2*n + r, with r in [-ln2/2, ln2/2].  */
 
-  svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->ln2_lo);
+  svfloat32_t lane_consts = svld1rq (svptrue_b32 (), &d->ln2_hi);
 
   /* n = round(x/(ln2/N)).  */
   svfloat32_t z = svmad_x (pg, sv_f32 (d->inv_ln2), x, d->shift);
   svfloat32_t n = svsub_x (pg, z, d->shift);
 
   /* r = x - n*ln2/N.  */
-  svfloat32_t r = svmsb_x (pg, sv_f32 (d->ln2_hi), n, x);
+  svfloat32_t r = x;
   r = svmls_lane (r, n, lane_consts, 0);
+  r = svmls_lane (r, n, lane_consts, 1);
 
   /* scale = 2^(n/N).  */
   svfloat32_t scale = svexpa (svreinterpret_u32 (z));
 
-  /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6.  */
-  svfloat32_t p12 = svmla_lane (sv_f32 (d->c1), r, lane_consts, 2);
-  svfloat32_t p34 = svmla_lane (sv_f32 (d->c3), r, lane_consts, 3);
+  /* poly(r) = exp(r) - 1 ~= r + 0.5 r^2.  */
   svfloat32_t r2 = svmul_x (svptrue_b32 (), r, r);
-  svfloat32_t p14 = svmla_x (pg, p12, p34, r2);
-  svfloat32_t p0 = svmul_lane (r, lane_consts, 1);
-  svfloat32_t poly = svmla_x (pg, p0, r2, p14);
+  svfloat32_t poly = svmla_lane (r, r2, lane_consts, 2);
 
   return svmla_x (pg, scale, scale, poly);
 }
-
 #endif
diff --git a/sysdeps/aarch64/fpu/sv_log1p_inline.h b/sysdeps/aarch64/fpu/sv_log1p_inline.h
index 71f88e02d..c2b196f35 100644
--- a/sysdeps/aarch64/fpu/sv_log1p_inline.h
+++ b/sysdeps/aarch64/fpu/sv_log1p_inline.h
@@ -21,11 +21,12 @@
 #define AARCH64_FPU_SV_LOG1P_INLINE_H
 
 #include "sv_math.h"
-#include "poly_sve_f64.h"
 
 static const struct sv_log1p_data
 {
-  double poly[19], ln2[2];
+  double c0, c2, c4, c6, c8, c10, c12, c14, c16;
+  double c1, c3, c5, c7, c9, c11, c13, c15, c17, c18;
+  double ln2_lo, ln2_hi;
   uint64_t hf_rt2_top;
   uint64_t one_m_hf_rt2_top;
   uint32_t bottom_mask;
@@ -33,15 +34,30 @@ static const struct sv_log1p_data
 } sv_log1p_data = {
   /* Coefficients generated using Remez, deg=20, in [sqrt(2)/2-1, sqrt(2)-1].
    */
-  .poly = { -0x1.ffffffffffffbp-2, 0x1.55555555551a9p-2, -0x1.00000000008e3p-2,
-	    0x1.9999999a32797p-3, -0x1.555555552fecfp-3, 0x1.249248e071e5ap-3,
-	    -0x1.ffffff8bf8482p-4, 0x1.c71c8f07da57ap-4, -0x1.9999ca4ccb617p-4,
-	    0x1.7459ad2e1dfa3p-4, -0x1.554d2680a3ff2p-4, 0x1.3b4c54d487455p-4,
-	    -0x1.2548a9ffe80e6p-4, 0x1.0f389a24b2e07p-4, -0x1.eee4db15db335p-5,
-	    0x1.e95b494d4a5ddp-5, -0x1.15fdf07cb7c73p-4, 0x1.0310b70800fcfp-4,
-	    -0x1.cfa7385bdb37ep-6 },
-  .ln2 = { 0x1.62e42fefa3800p-1, 0x1.ef35793c76730p-45 },
+  .c0 = -0x1.ffffffffffffbp-2,
+  .c1 = 0x1.55555555551a9p-2,
+  .c2 = -0x1.00000000008e3p-2,
+  .c3 = 0x1.9999999a32797p-3,
+  .c4 = -0x1.555555552fecfp-3,
+  .c5 = 0x1.249248e071e5ap-3,
+  .c6 = -0x1.ffffff8bf8482p-4,
+  .c7 = 0x1.c71c8f07da57ap-4,
+  .c8 = -0x1.9999ca4ccb617p-4,
+  .c9 = 0x1.7459ad2e1dfa3p-4,
+  .c10 = -0x1.554d2680a3ff2p-4,
+  .c11 = 0x1.3b4c54d487455p-4,
+  .c12 = -0x1.2548a9ffe80e6p-4,
+  .c13 = 0x1.0f389a24b2e07p-4,
+  .c14 = -0x1.eee4db15db335p-5,
+  .c15 = 0x1.e95b494d4a5ddp-5,
+  .c16 = -0x1.15fdf07cb7c73p-4,
+  .c17 = 0x1.0310b70800fcfp-4,
+  .c18 = -0x1.cfa7385bdb37ep-6,
+  .ln2_lo = 0x1.62e42fefa3800p-1,
+  .ln2_hi = 0x1.ef35793c76730p-45,
+  /* top32(asuint64(sqrt(2)/2)) << 32.  */
   .hf_rt2_top = 0x3fe6a09e00000000,
+  /* (top32(asuint64(1)) - top32(asuint64(sqrt(2)/2))) << 32.  */
   .one_m_hf_rt2_top = 0x00095f6200000000,
   .bottom_mask = 0xffffffff,
   .one_top = 0x3ff
@@ -51,14 +67,14 @@ static inline svfloat64_t
 sv_log1p_inline (svfloat64_t x, const svbool_t pg)
 {
   /* Helper for calculating log(x + 1). Adapted from v_log1p_inline.h, which
-     differs from v_log1p_2u5.c by:
+     differs from advsimd/log1p.c by:
      - No special-case handling - this should be dealt with by the caller.
      - Pairwise Horner polynomial evaluation for improved accuracy.
      - Optionally simulate the shortcut for k=0, used in the scalar routine,
        using svsel, for improved accuracy when the argument to log1p is close
      to 0. This feature is enabled by defining WANT_SV_LOG1P_K0_SHORTCUT as 1
      in the source of the caller before including this file.
-     See sv_log1p_2u1.c for details of the algorithm.  */
+     See sve/log1p.c for details of the algorithm.  */
   const struct sv_log1p_data *d = ptr_barrier (&sv_log1p_data);
   svfloat64_t m = svadd_x (pg, x, 1);
   svuint64_t mi = svreinterpret_u64 (m);
@@ -79,7 +95,7 @@ sv_log1p_inline (svfloat64_t x, const svbool_t pg)
   svfloat64_t cm;
 
 #ifndef WANT_SV_LOG1P_K0_SHORTCUT
-#error                                                                         \
+#error                                                                       \
   "Cannot use sv_log1p_inline.h without specifying whether you need the k0 shortcut for greater accuracy close to 0"
 #elif WANT_SV_LOG1P_K0_SHORTCUT
   /* Shortcut if k is 0 - set correction term to 0 and f to x. The result is
@@ -96,14 +112,46 @@ sv_log1p_inline (svfloat64_t x, const svbool_t pg)
 #endif
 
   /* Approximate log1p(f) on the reduced input using a polynomial.  */
-  svfloat64_t f2 = svmul_x (pg, f, f);
-  svfloat64_t p = sv_pw_horner_18_f64_x (pg, f, f2, d->poly);
+  svfloat64_t f2 = svmul_x (svptrue_b64 (), f, f),
+	      f4 = svmul_x (svptrue_b64 (), f2, f2),
+	      f8 = svmul_x (svptrue_b64 (), f4, f4),
+	      f16 = svmul_x (svptrue_b64 (), f8, f8);
+
+  svfloat64_t c13 = svld1rq (svptrue_b64 (), &d->c1);
+  svfloat64_t c57 = svld1rq (svptrue_b64 (), &d->c5);
+  svfloat64_t c911 = svld1rq (svptrue_b64 (), &d->c9);
+  svfloat64_t c1315 = svld1rq (svptrue_b64 (), &d->c13);
+  svfloat64_t c1718 = svld1rq (svptrue_b64 (), &d->c17);
+
+  /* Order-18 Estrin scheme.  */
+  svfloat64_t p01 = svmla_lane (sv_f64 (d->c0), f, c13, 0);
+  svfloat64_t p23 = svmla_lane (sv_f64 (d->c2), f, c13, 1);
+  svfloat64_t p45 = svmla_lane (sv_f64 (d->c4), f, c57, 0);
+  svfloat64_t p67 = svmla_lane (sv_f64 (d->c6), f, c57, 1);
+
+  svfloat64_t p03 = svmla_x (pg, p01, f2, p23);
+  svfloat64_t p47 = svmla_x (pg, p45, f2, p67);
+  svfloat64_t p07 = svmla_x (pg, p03, f4, p47);
+
+  svfloat64_t p89 = svmla_lane (sv_f64 (d->c8), f, c911, 0);
+  svfloat64_t p1011 = svmla_lane (sv_f64 (d->c10), f, c911, 1);
+  svfloat64_t p1213 = svmla_lane (sv_f64 (d->c12), f, c1315, 0);
+  svfloat64_t p1415 = svmla_lane (sv_f64 (d->c14), f, c1315, 1);
+
+  svfloat64_t p811 = svmla_x (pg, p89, f2, p1011);
+  svfloat64_t p1215 = svmla_x (pg, p1213, f2, p1415);
+  svfloat64_t p815 = svmla_x (pg, p811, f4, p1215);
+
+  svfloat64_t p015 = svmla_x (pg, p07, f8, p815);
+  svfloat64_t p1617 = svmla_lane (sv_f64 (d->c16), f, c1718, 0);
+  svfloat64_t p1618 = svmla_lane (p1617, f2, c1718, 1);
+  svfloat64_t p = svmla_x (pg, p015, f16, p1618);
 
   /* Assemble log1p(x) = k * log2 + log1p(f) + c/m.  */
-  svfloat64_t ylo = svmla_x (pg, cm, k, d->ln2[0]);
-  svfloat64_t yhi = svmla_x (pg, f, k, d->ln2[1]);
+  svfloat64_t ln2_lo_hi = svld1rq (svptrue_b64 (), &d->ln2_lo);
+  svfloat64_t ylo = svmla_lane (cm, k, ln2_lo_hi, 0);
+  svfloat64_t yhi = svmla_lane (f, k, ln2_lo_hi, 1);
 
-  return svmla_x (pg, svadd_x (pg, ylo, yhi), f2, p);
+  return svmad_x (pg, p, f2, svadd_x (pg, ylo, yhi));
 }
-
 #endif
diff --git a/sysdeps/aarch64/fpu/tanh_sve.c b/sysdeps/aarch64/fpu/tanh_sve.c
index 789cc6854..586941901 100644
--- a/sysdeps/aarch64/fpu/tanh_sve.c
+++ b/sysdeps/aarch64/fpu/tanh_sve.c
@@ -18,83 +18,117 @@
    <https://www.gnu.org/licenses/>.  */
 
 #include "sv_math.h"
-#include "poly_sve_f64.h"
 
 static const struct data
 {
-  float64_t poly[11];
-  float64_t inv_ln2, ln2_hi, ln2_lo, shift;
-  uint64_t thresh, tiny_bound;
+  double ln2_hi, ln2_lo; /* Adjacent: loaded as a pair by svld1rq.  */
+  double c2, c4; /* Adjacent: loaded as a pair by svld1rq.  */
+  double c0, c1, c3;
+  double two_over_ln2, shift;
+  uint64_t tiny_bound; /* NOTE(review): looks unused here -- confirm.  */
+  double large_bound, fexpa_bound;
+  uint64_t e2xm1_data[20];
 } data = {
-  /* Generated using Remez, deg=12 in [-log(2)/2, log(2)/2].  */
-  .poly = { 0x1p-1, 0x1.5555555555559p-3, 0x1.555555555554bp-5,
-	    0x1.111111110f663p-7, 0x1.6c16c16c1b5f3p-10,
-	    0x1.a01a01affa35dp-13, 0x1.a01a018b4ecbbp-16,
-	    0x1.71ddf82db5bb4p-19, 0x1.27e517fc0d54bp-22,
-	    0x1.af5eedae67435p-26, 0x1.1f143d060a28ap-29, },
-
-  .inv_ln2 = 0x1.71547652b82fep0,
-  .ln2_hi = -0x1.62e42fefa39efp-1,
-  .ln2_lo = -0x1.abc9e3b39803fp-56,
-  .shift = 0x1.8p52,
-
+  /* Generated using Remez, in [-log(2)/128, log(2)/128].  */
+  .c0 = 0x1p-1,
+  .c1 = 0x1.55555555548f9p-3,
+  .c2 = 0x1.5555555554c22p-5,
+  .c3 = 0x1.111123aaa2fb2p-7,
+  .c4 = 0x1.6c16d77d98e5bp-10,
+  .ln2_hi = 0x1.62e42fefa3800p-1,
+  .ln2_lo = 0x1.ef35793c76730p-45,
+  .two_over_ln2 = 0x1.71547652b82fep+1,
+  .shift = 0x1.800000000ffc0p+46,   /* 1.5*2^46+1023.  */
   .tiny_bound = 0x3e40000000000000, /* asuint64 (0x1p-27).  */
-  /* asuint64(0x1.241bf835f9d5fp+4) - asuint64(tiny_bound).  */
-  .thresh = 0x01f241bf835f9d5f,
+  .large_bound = 0x1.30fc1931f09cap+4, /* arctanh(1 - 2^-54).  */
+  .fexpa_bound = 0x1.a56ef8ec924ccp-4,	  /* 19/64 * ln2/2.  */
+  /* Table lookup of 2^(i/64) - 1, for values of i from 0..19.  */
+  .e2xm1_data = {
+    0x0000000000000000, 0x3f864d1f3bc03077, 0x3f966c34c5615d0f, 0x3fa0e8a30eb37901,
+    0x3fa6ab0d9f3121ec, 0x3fac7d865a7a3440, 0x3fb1301d0125b50a, 0x3fb429aaea92ddfb,
+    0x3fb72b83c7d517ae, 0x3fba35beb6fcb754, 0x3fbd4873168b9aa8, 0x3fc031dc431466b2,
+    0x3fc1c3d373ab11c3, 0x3fc35a2b2f13e6e9, 0x3fc4f4efa8fef709, 0x3fc6942d3720185a,
+    0x3fc837f0518db8a9, 0x3fc9e0459320b7fa, 0x3fcb8d39b9d54e55, 0x3fcd3ed9a72cffb7,
+  },
 };
 
+/* An expm1-inspired, FEXPA-based helper function that returns an
+   accurate estimate for e^2x - 1, with no special-case handling and no
+   support for negative inputs of x.  */
 static inline svfloat64_t
-expm1_inline (svfloat64_t x, const svbool_t pg, const struct data *d)
-{
-  /* Helper routine for calculating exp(x) - 1. Vector port of the helper from
-     the scalar variant of tanh.  */
-
-  /* Reduce argument: f in [-ln2/2, ln2/2], i is exact.  */
-  svfloat64_t j
-      = svsub_x (pg, svmla_x (pg, sv_f64 (d->shift), x, d->inv_ln2), d->shift);
-  svint64_t i = svcvt_s64_x (pg, j);
-  svfloat64_t f = svmla_x (pg, x, j, d->ln2_hi);
-  f = svmla_x (pg, f, j, d->ln2_lo);
-
-  /* Approximate expm1(f) using polynomial.  */
-  svfloat64_t f2 = svmul_x (pg, f, f);
-  svfloat64_t f4 = svmul_x (pg, f2, f2);
-  svfloat64_t p = svmla_x (
-      pg, f, f2,
-      sv_estrin_10_f64_x (pg, f, f2, f4, svmul_x (pg, f4, f4), d->poly));
-
-  /* t = 2 ^ i.  */
-  svfloat64_t t = svscale_x (pg, sv_f64 (1), i);
-  /* expm1(x) = p * t + (t - 1).  */
-  return svmla_x (pg, svsub_x (pg, t, 1), p, t);
-}
-
-static svfloat64_t NOINLINE
-special_case (svfloat64_t x, svfloat64_t y, svbool_t special)
+e2xm1_inline (const svbool_t pg, svfloat64_t x, const struct data *d)
 {
-  return sv_call_f64 (tanh, x, y, special);
+  svfloat64_t z = svmla_x (pg, sv_f64 (d->shift), x, d->two_over_ln2);
+  svuint64_t u = svreinterpret_u64 (z);
+  svfloat64_t n = svsub_x (pg, z, d->shift);
+
+  /* r = 2x - n * ln2, r is in [-ln2/(2N), ln2/(2N)].  */
+  svfloat64_t ln2 = svld1rq (svptrue_b64 (), &d->ln2_hi);
+  svfloat64_t r = svadd_x (pg, x, x);
+  r = svmls_lane (r, n, ln2, 0);
+  r = svmls_lane (r, n, ln2, 1);
+
+  /* y = exp(r) - 1 ~= r + C0 r^2 + C1 r^3 + C2 r^4 + C3 r^5 + C4 r^6.  */
+  svfloat64_t r2 = svmul_x (svptrue_b64 (), r, r);
+  svfloat64_t c24 = svld1rq (svptrue_b64 (), &d->c2);
+
+  svfloat64_t p;
+  svfloat64_t c12 = svmla_lane (sv_f64 (d->c1), r, c24, 0);
+  svfloat64_t c34 = svmla_lane (sv_f64 (d->c3), r, c24, 1);
+  p = svmad_x (pg, c34, r2, c12);
+  p = svmad_x (pg, p, r, sv_f64 (d->c0));
+  p = svmad_x (pg, p, r2, r);
+
+  svfloat64_t scale = svexpa (u);
+
+  /* We want to construct e2xm1(x) = (scale - 1) + scale * poly.
+     However, for values of scale close to 1, scale-1 causes large ULP errors
+     due to cancellation.
+
+     This can be circumvented by using a small lookup for scale-1
+     when our input is below a certain bound, otherwise we can use FEXPA.  */
+  svbool_t is_small = svaclt (pg, x, d->fexpa_bound);
+
+  /* Index via the input of FEXPA, but we only care about the lower 5 bits.  */
+  svuint64_t base_idx = svand_x (pg, u, 0x1f);
+
+  /* Compute scale - 1 from FEXPA, and lookup values where this fails.  */
+  svfloat64_t scalem1_estimate = svsub_x (pg, scale, sv_f64 (1.0));
+  svuint64_t scalem1_lookup
+      = svld1_gather_index (is_small, d->e2xm1_data, base_idx);
+
+  /* Select the appropriate scale - 1 value based on x.  */
+  svfloat64_t scalem1
+      = svsel (is_small, svreinterpret_f64 (scalem1_lookup), scalem1_estimate);
+  return svmla_x (pg, scalem1, scale, p);
 }
 
-/* SVE approximation for double-precision tanh(x), using a simplified
-   version of expm1. The greatest observed error is 2.77 ULP:
-   _ZGVsMxv_tanh(-0x1.c4a4ca0f9f3b7p-3) got -0x1.bd6a21a163627p-3
-				       want -0x1.bd6a21a163624p-3.  */
+/* SVE approximation for double-precision tanh(x), using a modified version of
+   FEXPA expm1 to calculate e^2x - 1.
+   The greatest observed error is 2.79 + 0.5 ULP:
+   _ZGVsMxv_tanh (0x1.fff868eb3c223p-9) got 0x1.fff7be486cae6p-9
+				       want 0x1.fff7be486cae9p-9.  */
 svfloat64_t SV_NAME_D1 (tanh) (svfloat64_t x, svbool_t pg)
 {
   const struct data *d = ptr_barrier (&data);
 
-  svuint64_t ia = svreinterpret_u64 (svabs_x (pg, x));
+  svbool_t large = svacge (pg, x, d->large_bound);
 
-  /* Trigger special-cases for tiny, boring and infinity/NaN.  */
-  svbool_t special = svcmpgt (pg, svsub_x (pg, ia, d->tiny_bound), d->thresh);
+  /* We can use tanh(x) = (e^2x - 1) / (e^2x + 1) to approximate tanh.
+     As an additional optimisation, we can ensure more accurate values of
+     e^x by only using positive inputs.  So we calculate tanh(|x|), and
+     restore the sign of the input before returning.  */
+  svfloat64_t ax = svabs_x (pg, x);
+  svuint64_t sign_bit
+      = sveor_x (pg, svreinterpret_u64 (x), svreinterpret_u64 (ax));
 
-  svfloat64_t u = svadd_x (pg, x, x);
+  svfloat64_t p = e2xm1_inline (pg, ax, d);
+  svfloat64_t q = svadd_x (pg, p, 2);
 
-  /* tanh(x) = (e^2x - 1) / (e^2x + 1).  */
-  svfloat64_t q = expm1_inline (u, pg, d);
-  svfloat64_t qp2 = svadd_x (pg, q, 2);
+  /* For sufficiently high inputs, the result of tanh(|x|) is 1 when correctly
+     rounded; at this point we can return 1 directly, with sign correction.
+     This will also act as a guard against our approximation overflowing.  */
+  svfloat64_t y = svsel (large, sv_f64 (1.0), svdiv_x (pg, p, q));
 
-  if (__glibc_unlikely (svptest_any (pg, special)))
-    return special_case (x, svdiv_x (pg, q, qp2), special);
-  return svdiv_x (pg, q, qp2);
+  return svreinterpret_f64 (svorr_x (pg, sign_bit, svreinterpret_u64 (y)));
 }
diff --git a/sysdeps/aarch64/multiarch/Makefile b/sysdeps/aarch64/multiarch/Makefile
index 772b16a35..1c3c39251 100644
--- a/sysdeps/aarch64/multiarch/Makefile
+++ b/sysdeps/aarch64/multiarch/Makefile
@@ -14,6 +14,7 @@ sysdep_routines += \
   memset_kunpeng \
   memset_mops \
   memset_oryon1 \
+  memset_sve_zva64 \
   memset_zva64 \
   strlen_asimd \
   strlen_generic \
diff --git a/sysdeps/aarch64/multiarch/ifunc-impl-list.c b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
index 0481e450b..8dc314b67 100644
--- a/sysdeps/aarch64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/aarch64/multiarch/ifunc-impl-list.c
@@ -57,6 +57,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_kunpeng)
 #if HAVE_AARCH64_SVE_ASM
 	      IFUNC_IMPL_ADD (array, i, memset, sve && !bti && zva_size == 256, __memset_a64fx)
+	      IFUNC_IMPL_ADD (array, i, memset, sve && zva_size == 64, __memset_sve_zva64)
 #endif
 	      IFUNC_IMPL_ADD (array, i, memset, mops, __memset_mops)
 	      IFUNC_IMPL_ADD (array, i, memset, 1, __memset_generic))
diff --git a/sysdeps/aarch64/multiarch/memset.c b/sysdeps/aarch64/multiarch/memset.c
index f6194e4a9..872f39f00 100644
--- a/sysdeps/aarch64/multiarch/memset.c
+++ b/sysdeps/aarch64/multiarch/memset.c
@@ -36,6 +36,7 @@ extern __typeof (__redirect_memset) __memset_a64fx attribute_hidden;
 extern __typeof (__redirect_memset) __memset_generic attribute_hidden;
 extern __typeof (__redirect_memset) __memset_mops attribute_hidden;
 extern __typeof (__redirect_memset) __memset_oryon1 attribute_hidden;
+extern __typeof (__redirect_memset) __memset_sve_zva64 attribute_hidden;
 
 static inline __typeof (__redirect_memset) *
 select_memset_ifunc (void)
@@ -49,6 +50,9 @@ select_memset_ifunc (void)
     {
       if (IS_A64FX (midr) && zva_size == 256)
 	return __memset_a64fx;
+
+      if (prefer_sve_ifuncs && zva_size == 64)
+	return __memset_sve_zva64;
     }
 
   if (IS_ORYON1 (midr) && zva_size == 64)
diff --git a/sysdeps/aarch64/multiarch/memset_sve_zva64.S b/sysdeps/aarch64/multiarch/memset_sve_zva64.S
new file mode 100644
index 000000000..7fb40fdd9
--- /dev/null
+++ b/sysdeps/aarch64/multiarch/memset_sve_zva64.S
@@ -0,0 +1,123 @@
+/* Optimized memset for SVE.
+   Copyright (C) 2025 Free Software Foundation, Inc.
+
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library.  If not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdep.h>
+
+/* Assumptions:
+ *
+ * ARMv8-a, AArch64, Advanced SIMD, SVE, unaligned accesses.
+ * ZVA size is 64.
+ */
+
+#if HAVE_AARCH64_SVE_ASM
+
+.arch armv8.2-a+sve
+
+#define dstin	x0
+#define val	x1
+#define valw	w1
+#define count	x2
+#define dst	x3
+#define dstend	x4
+#define zva_val	x5
+#define vlen	x5
+#define off	x3
+#define dstend2 x5
+
+ENTRY (__memset_sve_zva64)
+	dup	v0.16B, valw		/* Replicate fill byte into all of q0.  */
+	cmp	count, 16
+	b.lo	L(set_16)
+
+	add	dstend, dstin, count
+	cmp	count, 64
+	b.hs	L(set_128)
+
+	/* Set 16..63 bytes.  */
+	mov	off, 16
+	and	off, off, count, lsr 1	/* off = 16 if count >= 32, else 0.  */
+	sub	dstend2, dstend, off
+	str	q0, [dstin]
+	str	q0, [dstin, off]
+	str	q0, [dstend2, -16]
+	str	q0, [dstend, -16]
+	ret
+
+	.p2align 4
+L(set_16):
+	whilelo p0.b, xzr, count	/* Select the first COUNT byte lanes.  */
+	st1b	z0.b, p0, [dstin]	/* z0's low 128 bits alias v0.  */
+	ret
+
+	.p2align 4
+L(set_128):
+	bic	dst, dstin, 15		/* dst = dstin aligned down to 16.  */
+	cmp	count, 128
+	b.hi	L(set_long)
+	stp	q0, q0, [dstin]		/* Set 64..128 bytes.  */
+	stp	q0, q0, [dstin, 32]
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+
+	.p2align 4
+L(set_long):
+	cmp	count, 256
+	b.lo	L(no_zva)		/* Under 256 bytes: plain stores.  */
+	tst	valw, 255
+	b.ne	L(no_zva)		/* DC ZVA can only write zeros.  */
+
+	str	q0, [dstin]
+	str	q0, [dst, 16]
+	bic	dst, dstin, 31
+	stp	q0, q0, [dst, 32]
+	bic	dst, dstin, 63		/* Align down to the 64-byte ZVA block.  */
+	sub	count, dstend, dst	/* Count is now 64 too large.  */
+	sub	count, count, 128	/* Adjust count and bias for loop.  */
+
+	sub	x8, dstend, 1		/* Write last bytes before ZVA loop.  */
+	bic	x8, x8, 15
+	stp	q0, q0, [x8, -48]
+	str	q0, [x8, -16]
+	str	q0, [dstend, -16]
+
+	.p2align 4
+L(zva64_loop):
+	add	dst, dst, 64
+	dc	zva, dst		/* Zero the 64-byte block at dst.  */
+	subs	count, count, 64
+	b.hi	L(zva64_loop)
+	ret
+
+L(no_zva):
+	str	q0, [dstin]
+	sub	count, dstend, dst	/* Count is 16 too large.  */
+	sub	count, count, 64 + 16	/* Adjust count and bias for loop.  */
+L(no_zva_loop):
+	stp	q0, q0, [dst, 16]
+	stp	q0, q0, [dst, 48]
+	add	dst, dst, 64
+	subs	count, count, 64
+	b.hi	L(no_zva_loop)
+	stp	q0, q0, [dstend, -64]
+	stp	q0, q0, [dstend, -32]
+	ret
+
+END (__memset_sve_zva64)
+#endif
diff --git a/sysdeps/arm/find_exidx.c b/sysdeps/arm/find_exidx.c
index 60021a072..468e01621 100644
--- a/sysdeps/arm/find_exidx.c
+++ b/sysdeps/arm/find_exidx.c
@@ -15,6 +15,7 @@
    License along with the GNU C Library.  If not, see
    <https://www.gnu.org/licenses/>.  */
 
+#include <ldsodefs.h>
 #include <link.h>
 
 /* Find the exception index table containing PC.  */
@@ -23,7 +24,7 @@ _Unwind_Ptr
 __gnu_Unwind_Find_exidx (_Unwind_Ptr pc, int * pcount)
 {
   struct dl_find_object data;
-  if (__dl_find_object ((void *) pc, &data) < 0)
+  if (GLRO(dl_find_object) ((void *) pc, &data) < 0)
     return 0;
   *pcount = data.dlfo_eh_count;
   return (_Unwind_Ptr) data.dlfo_eh_frame;
diff --git a/sysdeps/generic/ldsodefs.h b/sysdeps/generic/ldsodefs.h
index e871f27ff..ddb34a158 100644
--- a/sysdeps/generic/ldsodefs.h
+++ b/sysdeps/generic/ldsodefs.h
@@ -695,10 +695,23 @@ extern const ElfW(Phdr) *_dl_phdr;
 extern size_t _dl_phnum;
 #endif
 
+/* Possible values for the glibc.rtld.execstack tunable.  */
+enum stack_tunable_mode
+  {
+    /* Do not allow executable stacks, even if the program requires one.  */
+    stack_tunable_mode_disable = 0,
+    /* Follow either the ABI requirement or the PT_GNU_STACK value.  */
+    stack_tunable_mode_enable = 1,
+    /* Always enable an executable stack.  */
+    stack_tunable_mode_force = 2
+  };
+
+void _dl_handle_execstack_tunable (void) attribute_hidden;
+
 /* This function changes the permission of the memory region pointed
    by STACK_ENDP to executable and update the internal memory protection
    flags for future thread stack creation.  */
-int _dl_make_stack_executable (void **stack_endp) attribute_hidden;
+int _dl_make_stack_executable (const void *stack_endp) attribute_hidden;
 
 /* Variable pointing to the end of the stack (or close to it).  This value
    must be constant over the runtime of the application.  Some programs
diff --git a/sysdeps/ieee754/dbl-64/e_atanh.c b/sysdeps/ieee754/dbl-64/e_atanh.c
index 1e09e46f0..d1c71b2aa 100644
--- a/sysdeps/ieee754/dbl-64/e_atanh.c
+++ b/sysdeps/ieee754/dbl-64/e_atanh.c
@@ -44,6 +44,11 @@
 
 static const double huge = 1e300;
 
+#ifndef SECTION
+# define SECTION
+#endif
+
+SECTION
 double
 __ieee754_atanh (double x)
 {
@@ -73,4 +78,7 @@ __ieee754_atanh (double x)
 
   return copysign (t, x);
 }
+
+#ifndef __ieee754_atanh
 libm_alias_finite (__ieee754_atanh, __atanh)
+#endif
diff --git a/sysdeps/ieee754/dbl-64/e_sinh.c b/sysdeps/ieee754/dbl-64/e_sinh.c
index b4b5857dd..3f787967f 100644
--- a/sysdeps/ieee754/dbl-64/e_sinh.c
+++ b/sysdeps/ieee754/dbl-64/e_sinh.c
@@ -41,6 +41,11 @@ static char rcsid[] = "$NetBSD: e_sinh.c,v 1.7 1995/05/10 20:46:13 jtc Exp $";
 
 static const double one = 1.0, shuge = 1.0e307;
 
+#ifndef SECTION
+# define SECTION
+#endif
+
+SECTION
 double
 __ieee754_sinh (double x)
 {
@@ -90,4 +95,7 @@ __ieee754_sinh (double x)
   /* |x| > overflowthresold, sinh(x) overflow */
   return math_narrow_eval (x * shuge);
 }
+
+#ifndef __ieee754_sinh
 libm_alias_finite (__ieee754_sinh, __sinh)
+#endif
diff --git a/sysdeps/ieee754/dbl-64/math_config.h b/sysdeps/ieee754/dbl-64/math_config.h
index 299a2ff8c..3382e385f 100644
--- a/sysdeps/ieee754/dbl-64/math_config.h
+++ b/sysdeps/ieee754/dbl-64/math_config.h
@@ -195,16 +195,18 @@ check_uflow (double x)
 extern const struct exp_data
 {
   double invln2N;
-  double shift;
   double negln2hiN;
   double negln2loN;
   double poly[4]; /* Last four coefficients.  */
+  double shift;
+
   double exp2_shift;
   double exp2_poly[EXP2_POLY_ORDER];
-  double invlog10_2N;
+
   double neglog10_2hiN;
   double neglog10_2loN;
   double exp10_poly[5];
+  double invlog10_2N;
   uint64_t tab[2*(1 << EXP_TABLE_BITS)];
 } __exp_data attribute_hidden;
 
diff --git a/sysdeps/ieee754/dbl-64/s_fma.c b/sysdeps/ieee754/dbl-64/s_fma.c
index 20f617b99..42351c6b3 100644
--- a/sysdeps/ieee754/dbl-64/s_fma.c
+++ b/sysdeps/ieee754/dbl-64/s_fma.c
@@ -244,6 +244,9 @@ __fma (double x, double y, double z)
   /* Reset rounding mode and test for inexact simultaneously.  */
   int j = libc_feupdateenv_test (&env, FE_INEXACT) != 0;
 
+  /* Ensure value of a1 + u.d is not reused.  */
+  a1 = math_opt_barrier (a1);
+
   if (__glibc_likely (adjust == 0))
     {
       if ((u.ieee.mantissa1 & 1) == 0 && u.ieee.exponent != 0x7ff)
diff --git a/sysdeps/ieee754/dbl-64/s_tanh.c b/sysdeps/ieee754/dbl-64/s_tanh.c
index 673a97102..13063db04 100644
--- a/sysdeps/ieee754/dbl-64/s_tanh.c
+++ b/sysdeps/ieee754/dbl-64/s_tanh.c
@@ -46,6 +46,11 @@ static char rcsid[] = "$NetBSD: s_tanh.c,v 1.7 1995/05/10 20:48:22 jtc Exp $";
 
 static const double one = 1.0, two = 2.0, tiny = 1.0e-300;
 
+#ifndef SECTION
+# define SECTION
+#endif
+
+SECTION
 double
 __tanh (double x)
 {
diff --git a/sysdeps/ieee754/flt-32/e_sinhf.c b/sysdeps/ieee754/flt-32/e_sinhf.c
index c007c7d17..dee96fc7c 100644
--- a/sysdeps/ieee754/flt-32/e_sinhf.c
+++ b/sysdeps/ieee754/flt-32/e_sinhf.c
@@ -83,7 +83,7 @@ __ieee754_sinhf (float x)
 	{					   /* |x| <= 0x1.250bfep-11 */
 	  if (__glibc_unlikely (ux < 0x66000000u)) /* |x| < 0x1p-24 */
 	    return fmaf (x, fabsf (x), x);
-	  if (__glibc_unlikely (st.uarg == asuint (ux)))
+	  if (__glibc_unlikely (st.uarg == ux))
 	    {
 	      float sgn = copysignf (1.0f, x);
 	      return sgn * st.rh + sgn * st.rl;
diff --git a/sysdeps/ieee754/flt-32/s_log10p1f.c b/sysdeps/ieee754/flt-32/s_log10p1f.c
index 64deb1eed..4e11d55d4 100644
--- a/sysdeps/ieee754/flt-32/s_log10p1f.c
+++ b/sysdeps/ieee754/flt-32/s_log10p1f.c
@@ -70,7 +70,7 @@ __log10p1f (float x)
     };
   static const double tl[] =
     {
-      0x1.562ec497ef351p-43, 0x1.b9476892ea99cp-8, 0x1.b5e909c959eecp-7,
+     -0x1.562ec497ef351p-43, 0x1.b9476892ea99cp-8, 0x1.b5e909c959eecp-7,
       0x1.45f4f59ec84fp-6,   0x1.af5f92cbcf2aap-6, 0x1.0ba01a6069052p-5,
       0x1.3ed119b99dd41p-5,  0x1.714834298a088p-5, 0x1.a30a9d98309c1p-5,
       0x1.d41d51266b9d9p-5,  0x1.02428c0f62dfcp-4, 0x1.1a23444eea521p-4,
diff --git a/sysdeps/ieee754/flt-32/s_tanf.c b/sysdeps/ieee754/flt-32/s_tanf.c
index dfe56fc2a..5ee1d6f35 100644
--- a/sysdeps/ieee754/flt-32/s_tanf.c
+++ b/sysdeps/ieee754/flt-32/s_tanf.c
@@ -166,7 +166,7 @@ __tanf (float x)
       uint32_t sgn = t >> 31;
       for (int j = 0; j < array_length (st); j++)
 	{
-	  if (__glibc_unlikely (asfloat (st[j].arg) == ax))
+	  if (__glibc_unlikely (asuint (st[j].arg) == ax))
 	    {
 	      if (sgn)
 		return -st[j].rh - st[j].rl;
diff --git a/sysdeps/mach/hurd/dl-execstack.c b/sysdeps/mach/hurd/dl-execstack.c
index 0617d3a16..dc4719bd3 100644
--- a/sysdeps/mach/hurd/dl-execstack.c
+++ b/sysdeps/mach/hurd/dl-execstack.c
@@ -26,12 +26,11 @@ extern struct hurd_startup_data *_dl_hurd_data attribute_hidden;
    so as to mprotect it.  */
 
 int
-_dl_make_stack_executable (void **stack_endp)
+_dl_make_stack_executable (const void *stack_endp)
 {
   /* Challenge the caller.  */
-  if (__builtin_expect (*stack_endp != __libc_stack_end, 0))
+  if (__glibc_unlikely (stack_endp != __libc_stack_end))
     return EPERM;
-  *stack_endp = NULL;
 
 #if IS_IN (rtld)
   if (__mprotect ((void *)_dl_hurd_data->stack_base, _dl_hurd_data->stack_size,
diff --git a/sysdeps/nptl/bits/thread-shared-types.h b/sysdeps/nptl/bits/thread-shared-types.h
index 7c24c0a6b..e614c7f3c 100644
--- a/sysdeps/nptl/bits/thread-shared-types.h
+++ b/sysdeps/nptl/bits/thread-shared-types.h
@@ -99,6 +99,8 @@ struct __pthread_cond_s
   unsigned int __g1_orig_size;
   unsigned int __wrefs;
   unsigned int __g_signals[2];
+  unsigned int __unused_initialized_1;
+  unsigned int __unused_initialized_2;
 };
 
 typedef unsigned int __tss_t;
diff --git a/sysdeps/nptl/dl-tls_init_tp.c b/sysdeps/nptl/dl-tls_init_tp.c
index f487bfb66..8629b5d41 100644
--- a/sysdeps/nptl/dl-tls_init_tp.c
+++ b/sysdeps/nptl/dl-tls_init_tp.c
@@ -23,6 +23,7 @@
 #include <tls.h>
 #include <rseq-internal.h>
 #include <thread_pointer.h>
+#include <dl-symbol-redir-ifunc.h>
 
 #define TUNABLE_NAMESPACE pthread
 #include <dl-tunables.h>
diff --git a/sysdeps/nptl/pthread.h b/sysdeps/nptl/pthread.h
index 050b4ab8d..9ad36cabe 100644
--- a/sysdeps/nptl/pthread.h
+++ b/sysdeps/nptl/pthread.h
@@ -152,7 +152,7 @@ enum
 
 
 /* Conditional variable handling.  */
-#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, 0, 0, {0, 0} } }
+#define PTHREAD_COND_INITIALIZER { { {0}, {0}, {0, 0}, 0, 0, {0, 0}, 0, 0 } }
 
 
 /* Cleanup buffers */
diff --git a/sysdeps/powerpc/powerpc64/le/power10/memchr.S b/sysdeps/powerpc/powerpc64/le/power10/memchr.S
deleted file mode 100644
index 96ad5a2e1..000000000
--- a/sysdeps/powerpc/powerpc64/le/power10/memchr.S
+++ /dev/null
@@ -1,315 +0,0 @@
-/* Optimized memchr implementation for POWER10 LE.
-   Copyright (C) 2021-2025 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.	 See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-# ifndef MEMCHR
-#  define MEMCHR __memchr
-# endif
-# define M_VREG_ZERO v20
-# define M_OFF_START_LOOP 256
-# define MEMCHR_SUBTRACT_VECTORS \
-	vsububm   v4,v4,v18;	    \
-	vsububm   v5,v5,v18;	    \
-	vsububm   v6,v6,v18;	    \
-	vsububm   v7,v7,v18;
-# define M_TAIL(vreg,increment)	   \
-	vctzlsbb  r4,vreg;	   \
-	cmpld     r5,r4;	   \
-	ble       L(null);	   \
-	addi	  r4,r4,increment; \
-	add	  r3,r6,r4;	   \
-	blr
-
-/* TODO: Replace macros by the actual instructions when minimum binutils becomes
-   >= 2.35.  This is used to keep compatibility with older versions.  */
-#define M_VEXTRACTBM(rt,vrb)	 \
-	.long(((4)<<(32-6))	 \
-	      | ((rt)<<(32-11))	 \
-	      | ((8)<<(32-16))	 \
-	      | ((vrb)<<(32-21)) \
-	      | 1602)
-
-#define M_LXVP(xtp,dq,ra)		   \
-	.long(((6)<<(32-6))		   \
-	      | ((((xtp)-32)>>1)<<(32-10)) \
-	      | ((1)<<(32-11))		   \
-	      | ((ra)<<(32-16))		   \
-	      | dq)
-
-#define CHECK16B(vreg,offset,addr,label) \
-	lxv	  vreg+32,offset(addr);	\
-	vcmpequb. vreg,vreg,v18;	\
-	bne	  cr6,L(label);		\
-	cmpldi	  r5,16;		\
-	ble	  L(null);		\
-	addi	  r5,r5,-16;
-
-/* Load 4 quadwords, merge into one VR for speed and check for NULLs.  r6 has #
-   of bytes already checked.  */
-#define CHECK64B(offset,addr,label)	    \
-	M_LXVP(v4+32,offset,addr);	    \
-	M_LXVP(v6+32,offset+32,addr);	    \
-	MEMCHR_SUBTRACT_VECTORS;	    \
-	vminub	  v14,v4,v5;		    \
-	vminub	  v15,v6,v7;		    \
-	vminub	  v16,v14,v15;		    \
-	vcmpequb. v0,v16,M_VREG_ZERO;	    \
-	beq	  cr6,$+12;		    \
-	li	  r7,offset;		    \
-	b     	  L(label);          	    \
-	cmpldi	  r5,64;		    \
-	ble	  L(null);		    \
-	addi	  r5,r5,-64
-
-/* Implements the function
-   void *[r3] memchr (const void *s [r3], int c [r4], size_t n [r5]).  */
-
-	.machine power9
-
-ENTRY_TOCLESS (MEMCHR)
-	CALL_MCOUNT 3
-
-	cmpldi	r5,0
-	beq	L(null)
-	mr	r0,r5
-	xori	r6,r4,0xff
-
-	mtvsrd	v18+32,r4	/* matching char in v18  */
-	mtvsrd	v19+32,r6	/* non matching char in v19  */
-
-	vspltb	v18,v18,7	/* replicate  */
-	vspltb	v19,v19,7	/* replicate  */
-	vspltisb  M_VREG_ZERO,0
-
-	/* Next 16B-aligned address. Prepare address for L(aligned).  */
-	addi	  r6,r3,16
-	clrrdi	  r6,r6,4
-
-	/* Align data and fill bytes not loaded with non matching char.	 */
-	lvx	  v0,0,r3
-	lvsr	  v1,0,r3
-	vperm	  v0,v19,v0,v1
-
-	vcmpequb. v6,v0,v18
-	bne	  cr6,L(found)
-	sub	  r4,r6,r3
-	cmpld	  r5,r4
-	ble	  L(null)
-	sub	  r5,r5,r4
-
-	/* Test up to OFF_START_LOOP-16 bytes in 16B chunks.  The main loop is
-	   optimized for longer strings, so checking the first bytes in 16B
-	   chunks benefits a lot small strings.  */
-	.p2align 5
-L(aligned):
-	cmpldi	r5,0
-	beq     L(null)
-
-	CHECK16B(v0,0,r6,tail1)
-	CHECK16B(v1,16,r6,tail2)
-	CHECK16B(v2,32,r6,tail3)
-	CHECK16B(v3,48,r6,tail4)
-	CHECK16B(v4,64,r6,tail5)
-	CHECK16B(v5,80,r6,tail6)
-	CHECK16B(v6,96,r6,tail7)
-	CHECK16B(v7,112,r6,tail8)
-	CHECK16B(v8,128,r6,tail9)
-	CHECK16B(v9,144,r6,tail10)
-	CHECK16B(v10,160,r6,tail11)
-	CHECK16B(v0,176,r6,tail12)
-	CHECK16B(v1,192,r6,tail13)
-	CHECK16B(v2,208,r6,tail14)
-	CHECK16B(v3,224,r6,tail15)
-
-	cmpdi	cr5,r4,0	/* Check if c == 0.  This will be useful to
-				   choose how we will perform the main loop.  */
-
-	/* Prepare address for the loop.  */
-	addi	  r4,r3,M_OFF_START_LOOP
-	clrrdi	  r4,r4,6
-	sub	  r6,r4,r3
-	sub	  r5,r0,r6
-	addi	  r6,r4,128
-
-	/* If c == 0, use the loop without the vsububm.  */
-	beq	cr5,L(loop)
-
-	/* This is very similar to the block after L(loop), the difference is
-	   that here MEMCHR_SUBTRACT_VECTORS is not empty, and we subtract
-	   each byte loaded by the char we are looking for, this way we can keep
-	   using vminub to merge the results and checking for nulls.  */
-	.p2align 5
-L(memchr_loop):
-	CHECK64B(0,r4,pre_tail_64b)
-	CHECK64B(64,r4,pre_tail_64b)
-	addi	r4,r4,256
-
-	CHECK64B(0,r6,tail_64b)
-	CHECK64B(64,r6,tail_64b)
-	addi	r6,r6,256
-
-	CHECK64B(0,r4,pre_tail_64b)
-	CHECK64B(64,r4,pre_tail_64b)
-	addi	r4,r4,256
-
-	CHECK64B(0,r6,tail_64b)
-	CHECK64B(64,r6,tail_64b)
-	addi	r6,r6,256
-
-	b	L(memchr_loop)
-	/* Switch to a more aggressive approach checking 64B each time.  Use 2
-	   pointers 128B apart and unroll the loop once to make the pointer
-	   updates and usages separated enough to avoid stalls waiting for
-	   address calculation.  */
-	.p2align 5
-L(loop):
-#undef MEMCHR_SUBTRACT_VECTORS
-#define MEMCHR_SUBTRACT_VECTORS /* nothing */
-	CHECK64B(0,r4,pre_tail_64b)
-	CHECK64B(64,r4,pre_tail_64b)
-	addi	  r4,r4,256
-
-	CHECK64B(0,r6,tail_64b)
-	CHECK64B(64,r6,tail_64b)
-	addi	  r6,r6,256
-
-	CHECK64B(0,r4,pre_tail_64b)
-	CHECK64B(64,r4,pre_tail_64b)
-	addi      r4,r4,256
-
-	CHECK64B(0,r6,tail_64b)
-	CHECK64B(64,r6,tail_64b)
-	addi      r6,r6,256
-
-	b	  L(loop)
-
-	.p2align  5
-L(pre_tail_64b):
-	mr	r6,r4
-L(tail_64b):
-	/* OK, we found a null byte.  Let's look for it in the current 64-byte
-	   block and mark it in its corresponding VR.  lxvp vx,0(ry) puts the
-	   low 16B bytes into vx+1, and the high into vx, so the order here is
-	   v5, v4, v7, v6.  */
-	vcmpequb  v1,v5,M_VREG_ZERO
-	vcmpequb  v2,v4,M_VREG_ZERO
-	vcmpequb  v3,v7,M_VREG_ZERO
-	vcmpequb  v4,v6,M_VREG_ZERO
-
-	/* Take into account the other 64B blocks we had already checked.  */
-	add	r6,r6,r7
-	/* Extract first bit of each byte.  */
-	M_VEXTRACTBM(r8,v1)
-	M_VEXTRACTBM(r9,v2)
-	M_VEXTRACTBM(r10,v3)
-	M_VEXTRACTBM(r11,v4)
-
-	/* Shift each value into their corresponding position.  */
-	sldi	  r9,r9,16
-	sldi	  r10,r10,32
-	sldi	  r11,r11,48
-
-	/* Merge the results.  */
-	or	  r8,r8,r9
-	or	  r9,r10,r11
-	or	  r11,r9,r8
-
-	cnttzd	  r0,r11	  /* Count trailing zeros before the match.  */
-	cmpld     r5,r0
-	ble	  L(null)
-	add	  r3,r6,r0	  /* Compute final address.  */
-	blr
-
-	.p2align  5
-L(tail1):
-	M_TAIL(v0,0)
-
-	.p2align  5
-L(tail2):
-	M_TAIL(v1,16)
-
-	.p2align  5
-L(tail3):
-	M_TAIL(v2,32)
-
-	.p2align  5
-L(tail4):
-	M_TAIL(v3,48)
-
-	.p2align  5
-L(tail5):
-	M_TAIL(v4,64)
-
-	.p2align  5
-L(tail6):
-	M_TAIL(v5,80)
-
-	.p2align  5
-L(tail7):
-	M_TAIL(v6,96)
-
-	.p2align  5
-L(tail8):
-	M_TAIL(v7,112)
-
-	.p2align  5
-L(tail9):
-	M_TAIL(v8,128)
-
-	.p2align  5
-L(tail10):
-	M_TAIL(v9,144)
-
-	.p2align  5
-L(tail11):
-	M_TAIL(v10,160)
-
-	.p2align  5
-L(tail12):
-	M_TAIL(v0,176)
-
-	.p2align  5
-L(tail13):
-	M_TAIL(v1,192)
-
-	.p2align  5
-L(tail14):
-	M_TAIL(v2,208)
-
-	.p2align  5
-L(tail15):
-	M_TAIL(v3,224)
-
-	.p2align  5
-L(found):
-	vctzlsbb  r7,v6
-	cmpld     r5,r7
-	ble       L(null)
-	add       r3,r3,r7
-	blr
-
-	.p2align  5
-L(null):
-	li	r3,0
-	blr
-
-END (MEMCHR)
-
-weak_alias (__memchr, memchr)
-libc_hidden_builtin_def (memchr)
diff --git a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S b/sysdeps/powerpc/powerpc64/le/power10/strcmp.S
deleted file mode 100644
index fffa1ee0a..000000000
--- a/sysdeps/powerpc/powerpc64/le/power10/strcmp.S
+++ /dev/null
@@ -1,233 +0,0 @@
-/* Optimized strcmp implementation for PowerPC64/POWER10.
-   Copyright (C) 2021-2025 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-#include <sysdep.h>
-
-#ifndef STRCMP
-# define STRCMP strcmp
-#endif
-
-/* Implements the function
-   int [r3] strcmp (const char *s1 [r3], const char *s2 [r4]).  */
-
-/* TODO: Change this to actual instructions when minimum binutils is upgraded
-   to 2.27.  Macros are defined below for these newer instructions in order
-   to maintain compatibility.  */
-
-#define LXVP(xtp,dq,ra)		     \
-	.long(((6)<<(32-6))	     \
-	| ((((xtp)-32)>>1)<<(32-10)) \
-	| ((1)<<(32-11))	     \
-	| ((ra)<<(32-16))	     \
-	| dq)
-
-#define COMPARE_16(vreg1,vreg2,offset)  \
-	lxv       vreg1+32,offset(r3);  \
-	lxv       vreg2+32,offset(r4);	\
-	vcmpnezb. v7,vreg1,vreg2;	\
-	bne       cr6,L(different);     \
-
-#define COMPARE_32(vreg1,vreg2,offset,label1,label2) \
-	LXVP(vreg1+32,offset,r3);                    \
-	LXVP(vreg2+32,offset,r4);                    \
-	vcmpnezb. v7,vreg1+1,vreg2+1;                \
-	bne	  cr6,L(label1);                     \
-	vcmpnezb. v7,vreg1,vreg2;                    \
-	bne	  cr6,L(label2);                     \
-
-#define TAIL(vreg1,vreg2)     \
-	vctzlsbb r6,v7;	      \
-	vextubrx r5,r6,vreg1; \
-	vextubrx r4,r6,vreg2; \
-	subf	 r3,r4,r5;    \
-	blr;                  \
-
-#define CHECK_N_BYTES(reg1,reg2,len_reg) \
-	sldi	  r0,len_reg,56;         \
-	lxvl	  32+v4,reg1,r0;         \
-	lxvl	  32+v5,reg2,r0;         \
-	add	  reg1,reg1,len_reg;     \
-	add	  reg2,reg2,len_reg;     \
-	vcmpnezb  v7,v4,v5;              \
-	vctzlsbb  r6,v7;                 \
-	cmpld	  cr7,r6,len_reg;        \
-	blt	  cr7,L(different);      \
-
-	/* TODO: change this to .machine power10 when the minimum required
-	binutils allows it.  */
-
-	.machine  power9
-ENTRY_TOCLESS (STRCMP, 4)
-	andi.	r7,r3,4095
-	andi.	r8,r4,4095
-	cmpldi	cr0,r7,4096-16
-	cmpldi	cr1,r8,4096-16
-	bgt	cr0,L(crosses)
-	bgt	cr1,L(crosses)
-	COMPARE_16(v4,v5,0)
-
-L(crosses):
-	andi.	r7,r3,15
-	subfic	r7,r7,16	/* r7(nalign1) = 16 - (str1 & 15).  */
-	andi.	r9,r4,15
-	subfic	r5,r9,16	/* r5(nalign2) = 16 - (str2 & 15).  */
-	cmpld	cr7,r7,r5
-	beq	cr7,L(same_aligned)
-	blt	cr7,L(nalign1_min)
-
-	/* nalign2 is minimum and s2 pointer is aligned.  */
-	CHECK_N_BYTES(r3,r4,r5)
-	/* Are we on the 64B hunk which crosses a page?  */
-	andi.	r10,r3,63	/* Determine offset into 64B hunk.  */
-	andi.	r8,r3,15        /* The offset into the 16B hunk.  */
-	neg	r7,r3
-	andi.	r9,r7,15	/* Number of bytes after a 16B cross.  */
-	rlwinm.	r7,r7,26,0x3F	/* ((r3-4096))>>6&63.  */
-	beq	L(compare_64_pagecross)
-	mtctr	r7
-	b	L(compare_64B_unaligned)
-
-	/* nalign1 is minimum and s1 pointer is aligned.  */
-L(nalign1_min):
-	CHECK_N_BYTES(r3,r4,r7)
-	/* Are we on the 64B hunk which crosses a page?  */
-	andi.	r10,r4,63	/* Determine offset into 64B hunk.  */
-	andi.	r8,r4,15	/* The offset into the 16B hunk.  */
-	neg	r7,r4
-	andi.	r9,r7,15	/* Number of bytes after a 16B cross.  */
-	rlwinm. r7,r7,26,0x3F	/* ((r4-4096))>>6&63.  */
-	beq	L(compare_64_pagecross)
-	mtctr	r7
-
-	.p2align 5
-L(compare_64B_unaligned):
-	COMPARE_16(v4,v5,0)
-	COMPARE_16(v4,v5,16)
-	COMPARE_16(v4,v5,32)
-	COMPARE_16(v4,v5,48)
-	addi	r3,r3,64
-	addi	r4,r4,64
-	bdnz	L(compare_64B_unaligned)
-
-	/* Cross the page boundary of s2, carefully. Only for first
-	iteration we have to get the count of 64B blocks to be checked.
-	From second iteration and beyond, loop counter is always 63.  */
-L(compare_64_pagecross):
-	li	r11, 63
-	mtctr	r11
-	cmpldi	r10,16
-	ble	L(cross_4)
-	cmpldi	r10,32
-	ble	L(cross_3)
-	cmpldi	r10,48
-	ble	L(cross_2)
-L(cross_1):
-	CHECK_N_BYTES(r3,r4,r9)
-	CHECK_N_BYTES(r3,r4,r8)
-	COMPARE_16(v4,v5,0)
-	COMPARE_16(v4,v5,16)
-	COMPARE_16(v4,v5,32)
-	addi	r3,r3,48
-	addi	r4,r4,48
-	b	L(compare_64B_unaligned)
-L(cross_2):
-	COMPARE_16(v4,v5,0)
-	addi	r3,r3,16
-	addi	r4,r4,16
-	CHECK_N_BYTES(r3,r4,r9)
-	CHECK_N_BYTES(r3,r4,r8)
-	COMPARE_16(v4,v5,0)
-	COMPARE_16(v4,v5,16)
-	addi	r3,r3,32
-	addi	r4,r4,32
-	b	L(compare_64B_unaligned)
-L(cross_3):
-	COMPARE_16(v4,v5,0)
-	COMPARE_16(v4,v5,16)
-	addi	r3,r3,32
-	addi	r4,r4,32
-	CHECK_N_BYTES(r3,r4,r9)
-	CHECK_N_BYTES(r3,r4,r8)
-	COMPARE_16(v4,v5,0)
-	addi	r3,r3,16
-	addi	r4,r4,16
-	b	L(compare_64B_unaligned)
-L(cross_4):
-	COMPARE_16(v4,v5,0)
-	COMPARE_16(v4,v5,16)
-	COMPARE_16(v4,v5,32)
-	addi	r3,r3,48
-	addi	r4,r4,48
-	CHECK_N_BYTES(r3,r4,r9)
-	CHECK_N_BYTES(r3,r4,r8)
-	b	L(compare_64B_unaligned)
-
-L(same_aligned):
-	CHECK_N_BYTES(r3,r4,r7)
-        /* Align s1 to 32B and adjust s2 address.
-	   Use lxvp only if both s1 and s2 are 32B aligned.  */
-	COMPARE_16(v4,v5,0)
-	COMPARE_16(v4,v5,16)
-	COMPARE_16(v4,v5,32)
-	COMPARE_16(v4,v5,48)
-	addi	r3,r3,64
-	addi	r4,r4,64
-	COMPARE_16(v4,v5,0)
-	COMPARE_16(v4,v5,16)
-
-	clrldi	r6,r3,59
-	subfic	r5,r6,32
-	add	r3,r3,r5
-	add	r4,r4,r5
-	andi.	r5,r4,0x1F
-	beq	cr0,L(32B_aligned_loop)
-
-	.p2align 5
-L(16B_aligned_loop):
-	COMPARE_16(v4,v5,0)
-	COMPARE_16(v4,v5,16)
-	COMPARE_16(v4,v5,32)
-	COMPARE_16(v4,v5,48)
-	addi	r3,r3,64
-	addi	r4,r4,64
-	b	L(16B_aligned_loop)
-
-	/* Calculate and return the difference.  */
-L(different):
-	TAIL(v4,v5)
-
-	.p2align 5
-L(32B_aligned_loop):
-	COMPARE_32(v14,v16,0,tail1,tail2)
-	COMPARE_32(v18,v20,32,tail3,tail4)
-	COMPARE_32(v22,v24,64,tail5,tail6)
-	COMPARE_32(v26,v28,96,tail7,tail8)
-	addi	r3,r3,128
-	addi	r4,r4,128
-	b	L(32B_aligned_loop)
-
-L(tail1): TAIL(v15,v17)
-L(tail2): TAIL(v14,v16)
-L(tail3): TAIL(v19,v21)
-L(tail4): TAIL(v18,v20)
-L(tail5): TAIL(v23,v25)
-L(tail6): TAIL(v22,v24)
-L(tail7): TAIL(v27,v29)
-L(tail8): TAIL(v26,v28)
-
-END (STRCMP)
-libc_hidden_builtin_def (strcmp)
diff --git a/sysdeps/powerpc/powerpc64/le/power10/strncmp.S b/sysdeps/powerpc/powerpc64/le/power10/strncmp.S
deleted file mode 100644
index 10700dd40..000000000
--- a/sysdeps/powerpc/powerpc64/le/power10/strncmp.S
+++ /dev/null
@@ -1,271 +0,0 @@
-/* Optimized strncmp implementation for PowerPC64/POWER10.
-   Copyright (C) 2024-2025 Free Software Foundation, Inc.
-   This file is part of the GNU C Library.
-
-   The GNU C Library is free software; you can redistribute it and/or
-   modify it under the terms of the GNU Lesser General Public
-   License as published by the Free Software Foundation; either
-   version 2.1 of the License, or (at your option) any later version.
-
-   The GNU C Library is distributed in the hope that it will be useful,
-   but WITHOUT ANY WARRANTY; without even the implied warranty of
-   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
-   Lesser General Public License for more details.
-
-   You should have received a copy of the GNU Lesser General Public
-   License along with the GNU C Library; if not, see
-   <https://www.gnu.org/licenses/>.  */
-
-#include <sysdep.h>
-
-/* Implements the function
-
-   int [r3] strncmp (const char *s1 [r3], const char *s2 [r4], size_t [r5] n)
-
-   The implementation uses unaligned doubleword access to avoid specialized
-   code paths depending of data alignment for first 32 bytes and uses
-   vectorised loops after that.  */
-
-#ifndef STRNCMP
-# define STRNCMP strncmp
-#endif
-
-/* TODO: Change this to actual instructions when minimum binutils is upgraded
-   to 2.27.  Macros are defined below for these newer instructions in order
-   to maintain compatibility.  */
-
-#define LXVP(xtp,dq,ra)              \
-	.long(((6)<<(32-6))          \
-	| ((((xtp)-32)>>1)<<(32-10)) \
-	| ((1)<<(32-11))             \
-	| ((ra)<<(32-16))            \
-	| dq)
-
-#define COMPARE_16(vreg1,vreg2,offset) \
-	lxv	  vreg1+32,offset(r3); \
-	lxv	  vreg2+32,offset(r4); \
-	vcmpnezb. v7,vreg1,vreg2;      \
-	bne	  cr6,L(different);    \
-	cmpldi	  cr7,r5,16;           \
-	ble	  cr7,L(ret0);         \
-	addi	  r5,r5,-16;
-
-#define COMPARE_32(vreg1,vreg2,offset,label1,label2) \
-	LXVP(vreg1+32,offset,r3);                    \
-	LXVP(vreg2+32,offset,r4);                    \
-	vcmpnezb. v7,vreg1+1,vreg2+1;                \
-	bne	  cr6,L(label1);                     \
-	vcmpnezb. v7,vreg1,vreg2;                    \
-	bne	  cr6,L(label2);                     \
-	cmpldi	  cr7,r5,32;                         \
-	ble	  cr7,L(ret0);                       \
-	addi	  r5,r5,-32;
-
-#define TAIL_FIRST_16B(vreg1,vreg2) \
-	vctzlsbb r6,v7;             \
-	cmpld	 cr7,r5,r6;         \
-	ble	 cr7,L(ret0);       \
-	vextubrx r5,r6,vreg1;       \
-	vextubrx r4,r6,vreg2;       \
-	subf	 r3,r4,r5;          \
-	blr;
-
-#define TAIL_SECOND_16B(vreg1,vreg2) \
-	vctzlsbb r6,v7;              \
-	addi	 r0,r6,16;           \
-	cmpld	 cr7,r5,r0;          \
-	ble	 cr7,L(ret0);        \
-	vextubrx r5,r6,vreg1;        \
-	vextubrx r4,r6,vreg2;        \
-	subf	 r3,r4,r5;           \
-	blr;
-
-#define CHECK_N_BYTES(reg1,reg2,len_reg) \
-	sldi	  r6,len_reg,56;	 \
-	lxvl	  32+v4,reg1,r6;	 \
-	lxvl	  32+v5,reg2,r6;	 \
-	add	  reg1,reg1,len_reg;	 \
-	add	  reg2,reg2,len_reg;	 \
-	vcmpnezb  v7,v4,v5;		 \
-	vctzlsbb  r6,v7;		 \
-	cmpld	  cr7,r6,len_reg;	 \
-	blt	  cr7,L(different);	 \
-	cmpld	  cr7,r5,len_reg;	 \
-	ble	  cr7,L(ret0);		 \
-	sub	  r5,r5,len_reg;	 \
-
-	/* TODO: change this to .machine power10 when the minimum required
-	 binutils allows it.  */
-	.machine  power9
-ENTRY_TOCLESS (STRNCMP, 4)
-	/* Check if size is 0.  */
-	cmpdi	 cr0,r5,0
-	beq	 cr0,L(ret0)
-	andi.   r7,r3,4095
-	andi.   r8,r4,4095
-	cmpldi  cr0,r7,4096-16
-	cmpldi  cr1,r8,4096-16
-	bgt     cr0,L(crosses)
-	bgt     cr1,L(crosses)
-	COMPARE_16(v4,v5,0)
-	addi	r3,r3,16
-	addi	r4,r4,16
-
-L(crosses):
-	andi.	 r7,r3,15
-	subfic	 r7,r7,16	/* r7(nalign1) = 16 - (str1 & 15).  */
-	andi.	 r9,r4,15
-	subfic	 r8,r9,16	/* r8(nalign2) = 16 - (str2 & 15).  */
-	cmpld	 cr7,r7,r8
-	beq	 cr7,L(same_aligned)
-	blt	 cr7,L(nalign1_min)
-
-	/* nalign2 is minimum and s2 pointer is aligned.  */
-	CHECK_N_BYTES(r3,r4,r8)
-	/* Are we on the 64B hunk which crosses a page?  */
-	andi.   r10,r3,63       /* Determine offset into 64B hunk.  */
-	andi.   r8,r3,15        /* The offset into the 16B hunk.  */
-	neg     r7,r3
-	andi.   r9,r7,15        /* Number of bytes after a 16B cross.  */
-	rlwinm. r7,r7,26,0x3F   /* ((r4-4096))>>6&63.  */
-	beq     L(compare_64_pagecross)
-	mtctr   r7
-	b       L(compare_64B_unaligned)
-
-	/* nalign1 is minimum and s1 pointer is aligned.  */
-L(nalign1_min):
-	CHECK_N_BYTES(r3,r4,r7)
-	/* Are we on the 64B hunk which crosses a page?  */
-	andi.   r10,r4,63       /* Determine offset into 64B hunk.  */
-	andi.   r8,r4,15        /* The offset into the 16B hunk.  */
-	neg     r7,r4
-	andi.   r9,r7,15        /* Number of bytes after a 16B cross.  */
-	rlwinm. r7,r7,26,0x3F   /* ((r4-4096))>>6&63.  */
-	beq     L(compare_64_pagecross)
-	mtctr   r7
-
-	.p2align 5
-L(compare_64B_unaligned):
-	COMPARE_16(v4,v5,0)
-	COMPARE_16(v4,v5,16)
-	COMPARE_16(v4,v5,32)
-	COMPARE_16(v4,v5,48)
-	addi    r3,r3,64
-	addi    r4,r4,64
-	bdnz    L(compare_64B_unaligned)
-
-	/* Cross the page boundary of s2, carefully. Only for first
-	iteration we have to get the count of 64B blocks to be checked.
-	From second iteration and beyond, loop counter is always 63.  */
-L(compare_64_pagecross):
-	li      r11, 63
-	mtctr   r11
-	cmpldi  r10,16
-	ble     L(cross_4)
-	cmpldi  r10,32
-	ble     L(cross_3)
-	cmpldi  r10,48
-	ble     L(cross_2)
-L(cross_1):
-	CHECK_N_BYTES(r3,r4,r9)
-	CHECK_N_BYTES(r3,r4,r8)
-	COMPARE_16(v4,v5,0)
-	COMPARE_16(v4,v5,16)
-	COMPARE_16(v4,v5,32)
-	addi    r3,r3,48
-	addi    r4,r4,48
-	b       L(compare_64B_unaligned)
-L(cross_2):
-	COMPARE_16(v4,v5,0)
-	addi    r3,r3,16
-	addi    r4,r4,16
-	CHECK_N_BYTES(r3,r4,r9)
-	CHECK_N_BYTES(r3,r4,r8)
-	COMPARE_16(v4,v5,0)
-	COMPARE_16(v4,v5,16)
-	addi    r3,r3,32
-	addi    r4,r4,32
-	b       L(compare_64B_unaligned)
-L(cross_3):
-	COMPARE_16(v4,v5,0)
-	COMPARE_16(v4,v5,16)
-	addi    r3,r3,32
-	addi    r4,r4,32
-	CHECK_N_BYTES(r3,r4,r9)
-	CHECK_N_BYTES(r3,r4,r8)
-	COMPARE_16(v4,v5,0)
-	addi    r3,r3,16
-	addi    r4,r4,16
-	b       L(compare_64B_unaligned)
-L(cross_4):
-	COMPARE_16(v4,v5,0)
-	COMPARE_16(v4,v5,16)
-	COMPARE_16(v4,v5,32)
-	addi    r3,r3,48
-	addi    r4,r4,48
-	CHECK_N_BYTES(r3,r4,r9)
-	CHECK_N_BYTES(r3,r4,r8)
-	b       L(compare_64B_unaligned)
-
-L(same_aligned):
-	CHECK_N_BYTES(r3,r4,r7)
-	/* Align s1 to 32B and adjust s2 address.
-	   Use lxvp only if both s1 and s2 are 32B aligned.  */
-	COMPARE_16(v4,v5,0)
-	COMPARE_16(v4,v5,16)
-	COMPARE_16(v4,v5,32)
-	COMPARE_16(v4,v5,48)
-	addi	r3,r3,64
-	addi	r4,r4,64
-	COMPARE_16(v4,v5,0)
-	COMPARE_16(v4,v5,16)
-	addi	r5,r5,32
-
-	clrldi  r6,r3,59
-	subfic	r7,r6,32
-	add	r3,r3,r7
-	add	r4,r4,r7
-	subf	r5,r7,r5
-	andi.	r7,r4,0x1F
-	beq	cr0,L(32B_aligned_loop)
-
-	.p2align 5
-L(16B_aligned_loop):
-	COMPARE_16(v4,v5,0)
-	COMPARE_16(v4,v5,16)
-	COMPARE_16(v4,v5,32)
-	COMPARE_16(v4,v5,48)
-	addi	r3,r3,64
-	addi	r4,r4,64
-	b	L(16B_aligned_loop)
-
-	/* Calculate and return the difference.  */
-L(different):
-	TAIL_FIRST_16B(v4,v5)
-
-	.p2align 5
-L(32B_aligned_loop):
-	COMPARE_32(v14,v16,0,tail1,tail2)
-	COMPARE_32(v18,v20,32,tail3,tail4)
-	COMPARE_32(v22,v24,64,tail5,tail6)
-	COMPARE_32(v26,v28,96,tail7,tail8)
-	addi	r3,r3,128
-	addi	r4,r4,128
-	b	L(32B_aligned_loop)
-
-L(tail1): TAIL_FIRST_16B(v15,v17)
-L(tail2): TAIL_SECOND_16B(v14,v16)
-L(tail3): TAIL_FIRST_16B(v19,v21)
-L(tail4): TAIL_SECOND_16B(v18,v20)
-L(tail5): TAIL_FIRST_16B(v23,v25)
-L(tail6): TAIL_SECOND_16B(v22,v24)
-L(tail7): TAIL_FIRST_16B(v27,v29)
-L(tail8): TAIL_SECOND_16B(v26,v28)
-
-	.p2align 5
-L(ret0):
-	li	r3,0
-	blr
-
-END(STRNCMP)
-libc_hidden_builtin_def(strncmp)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/Makefile b/sysdeps/powerpc/powerpc64/multiarch/Makefile
index dc7c5b14e..142e6c24c 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/Makefile
+++ b/sysdeps/powerpc/powerpc64/multiarch/Makefile
@@ -31,12 +31,11 @@ sysdep_routines += memcpy-power8-cached memcpy-power7 memcpy-a2 memcpy-power6 \
 		   strncase-power8
 
 ifneq (,$(filter %le,$(config-machine)))
-sysdep_routines += memchr-power10 memcmp-power10 memcpy-power10 \
-		   memmove-power10 memset-power10 rawmemchr-power9 \
-		   rawmemchr-power10 strcmp-power9 strcmp-power10 \
-		   strncmp-power9 strncmp-power10 strcpy-power9 strcat-power10 \
-		   stpcpy-power9 strlen-power9 strncpy-power9 stpncpy-power9 \
-		   strlen-power10
+sysdep_routines += memcmp-power10 memcpy-power10 memmove-power10 memset-power10 \
+		   rawmemchr-power9 rawmemchr-power10 \
+		   strcmp-power9 strncmp-power9 \
+		   strcpy-power9 strcat-power10 stpcpy-power9 \
+		   strlen-power9 strncpy-power9 stpncpy-power9 strlen-power10
 endif
 CFLAGS-strncase-power7.c += -mcpu=power7 -funroll-loops
 CFLAGS-strncase_l-power7.c += -mcpu=power7 -funroll-loops
diff --git a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
index 0a31a5853..de288a0d8 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/ifunc-impl-list.c
@@ -164,9 +164,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strncmp.c.  */
   IFUNC_IMPL (i, name, strncmp,
 #ifdef __LITTLE_ENDIAN__
-	      IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_3_1
-			      && hwcap & PPC_FEATURE_HAS_VSX,
-			      __strncmp_power10)
 	      IFUNC_IMPL_ADD (array, i, strncmp, hwcap2 & PPC_FEATURE2_ARCH_3_00
 			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
 			      __strncmp_power9)
@@ -229,12 +226,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 
   /* Support sysdeps/powerpc/powerpc64/multiarch/memchr.c.  */
   IFUNC_IMPL (i, name, memchr,
-#ifdef __LITTLE_ENDIAN__
-	      IFUNC_IMPL_ADD (array, i, memchr,
-		              hwcap2 & PPC_FEATURE2_ARCH_3_1
-			      && hwcap & PPC_FEATURE_HAS_VSX,
-			      __memchr_power10)
-#endif
 	      IFUNC_IMPL_ADD (array, i, memchr,
 			      hwcap2 & PPC_FEATURE2_ARCH_2_07
 			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
@@ -386,10 +377,6 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
   /* Support sysdeps/powerpc/powerpc64/multiarch/strcmp.c.  */
   IFUNC_IMPL (i, name, strcmp,
 #ifdef __LITTLE_ENDIAN__
-	      IFUNC_IMPL_ADD (array, i, strcmp,
-			      (hwcap2 & PPC_FEATURE2_ARCH_3_1)
-			      && (hwcap & PPC_FEATURE_HAS_VSX),
-			      __strcmp_power10)
 	      IFUNC_IMPL_ADD (array, i, strcmp,
 			      hwcap2 & PPC_FEATURE2_ARCH_3_00
 			      && hwcap & PPC_FEATURE_HAS_ALTIVEC,
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr.c b/sysdeps/powerpc/powerpc64/multiarch/memchr.c
index b63c7968c..3abd64aed 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memchr.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/memchr.c
@@ -25,23 +25,15 @@ extern __typeof (__memchr) __memchr_ppc attribute_hidden;
 extern __typeof (__memchr) __memchr_power7 attribute_hidden;
 extern __typeof (__memchr) __memchr_power8 attribute_hidden;
 
-# ifdef __LITTLE_ENDIAN__
-extern __typeof (__memchr) __memchr_power10 attribute_hidden;
-# endif
 /* Avoid DWARF definition DIE on ifunc symbol so that GDB can handle
    ifunc symbol properly.  */
 libc_ifunc (__memchr,
-# ifdef __LITTLE_ENDIAN__
-	    (hwcap2 & PPC_FEATURE2_ARCH_3_1
-	     && hwcap & PPC_FEATURE_HAS_VSX)
-	    ? __memchr_power10 :
-# endif
-	      (hwcap2 & PPC_FEATURE2_ARCH_2_07
-	      && hwcap & PPC_FEATURE_HAS_ALTIVEC)
-	      ? __memchr_power8 :
-	        (hwcap & PPC_FEATURE_ARCH_2_06)
-	        ? __memchr_power7
-	        : __memchr_ppc);
+	    (hwcap2 & PPC_FEATURE2_ARCH_2_07
+	     && hwcap & PPC_FEATURE_HAS_ALTIVEC)
+	    ? __memchr_power8 :
+	    (hwcap & PPC_FEATURE_ARCH_2_06)
+            ? __memchr_power7
+            : __memchr_ppc);
 
 weak_alias (__memchr, memchr)
 libc_hidden_builtin_def (memchr)
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
index 3c636e3bb..7c77c084a 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strcmp.c
@@ -29,16 +29,12 @@ extern __typeof (strcmp) __strcmp_power7 attribute_hidden;
 extern __typeof (strcmp) __strcmp_power8 attribute_hidden;
 # ifdef __LITTLE_ENDIAN__
 extern __typeof (strcmp) __strcmp_power9 attribute_hidden;
-extern __typeof (strcmp) __strcmp_power10 attribute_hidden;
 # endif
 
 # undef strcmp
 
 libc_ifunc_redirected (__redirect_strcmp, strcmp,
 # ifdef __LITTLE_ENDIAN__
-		        (hwcap2 & PPC_FEATURE2_ARCH_3_1
-			 && hwcap & PPC_FEATURE_HAS_VSX)
-			? __strcmp_power10 :
 			(hwcap2 & PPC_FEATURE2_ARCH_3_00
 			 && hwcap & PPC_FEATURE_HAS_ALTIVEC)
 			? __strcmp_power9 :
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
index 0a664a620..4cfe27fa4 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
+++ b/sysdeps/powerpc/powerpc64/multiarch/strncmp.c
@@ -29,7 +29,6 @@ extern __typeof (strncmp) __strncmp_ppc attribute_hidden;
 extern __typeof (strncmp) __strncmp_power8 attribute_hidden;
 # ifdef __LITTLE_ENDIAN__
 extern __typeof (strncmp) __strncmp_power9 attribute_hidden;
-extern __typeof (strncmp) __strncmp_power10 attribute_hidden;
 # endif
 # undef strncmp
 
@@ -37,9 +36,6 @@ extern __typeof (strncmp) __strncmp_power10 attribute_hidden;
    ifunc symbol properly.  */
 libc_ifunc_redirected (__redirect_strncmp, strncmp,
 # ifdef __LITTLE_ENDIAN__
-			(hwcap2 & PPC_FEATURE2_ARCH_3_1
-			 && hwcap & PPC_FEATURE_HAS_VSX)
-			? __strncmp_power10 :
 			(hwcap2 & PPC_FEATURE2_ARCH_3_00
 			 && hwcap & PPC_FEATURE_HAS_ALTIVEC)
 			? __strncmp_power9 :
diff --git a/sysdeps/pthread/Makefile b/sysdeps/pthread/Makefile
index a123e28a5..7fcbc72bc 100644
--- a/sysdeps/pthread/Makefile
+++ b/sysdeps/pthread/Makefile
@@ -106,6 +106,7 @@ tests += \
   tst-cancel28 \
   tst-cancel29 \
   tst-cancel30 \
+  tst-cancel32 \
   tst-cleanup0 \
   tst-cleanup1 \
   tst-cleanup2 \
@@ -271,6 +272,7 @@ tests += \
   tst-spin4 \
   tst-spin5 \
   tst-stack1 \
+  tst-stack2 \
   tst-stdio1 \
   tst-stdio2 \
   tst-thrd-detach \
@@ -366,6 +368,7 @@ modules-names += \
   tst-atfork4mod \
   tst-create1mod \
   tst-fini1mod \
+  tst-stack2-mod \
   tst-tls4moda \
   tst-tls4modb \
   # modules-names
@@ -539,4 +542,12 @@ LDFLAGS-tst-create1 = -Wl,-export-dynamic
 $(objpfx)tst-create1: $(shared-thread-library)
 $(objpfx)tst-create1.out: $(objpfx)tst-create1mod.so
 
+$(objpfx)tst-stack2.out: $(objpfx)tst-stack2-mod.so
+$(objpfx)tst-stack2-mod.so: $(shared-thread-library)
+LDFLAGS-tst-stack2-mod.so = -Wl,-z,execstack
+ifeq ($(have-no-error-execstack),yes)
+LDFLAGS-tst-stack2-mod.so += -Wl,--no-error-execstack
+endif
+tst-stack2-ENV = GLIBC_TUNABLES=glibc.rtld.execstack=2
+
 endif
diff --git a/sysdeps/pthread/tst-cancel32.c b/sysdeps/pthread/tst-cancel32.c
new file mode 100644
index 000000000..ab550c16b
--- /dev/null
+++ b/sysdeps/pthread/tst-cancel32.c
@@ -0,0 +1,73 @@
+/* Check if pthread_setcanceltype disables asynchronous cancellation
+   once cancellation happens (BZ 32782)
+
+   Copyright (C) 2025 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+/* pthread_setcanceltype is a cancellation entry point.  If asynchronous
+   cancellation is enabled and cancellation starts (on the second
+   pthread_setcanceltype call), asynchronous cancellation should not
+   restart the process.  */
+
+#include <support/xthread.h>
+
+#define NITER     1000
+#define NTHREADS     8
+
+static void
+tf_cleanup (void *arg)
+{
+}
+
+static void *
+tf (void *closure)
+{
+  pthread_cleanup_push (tf_cleanup, NULL);
+  for (;;)
+    {
+      /* The only possible failure for pthread_setcanceltype is an
+	 invalid cancellation type.  */
+      pthread_setcanceltype (PTHREAD_CANCEL_ASYNCHRONOUS, NULL);
+      pthread_setcanceltype (PTHREAD_CANCEL_DEFERRED, NULL);
+    }
+  pthread_cleanup_pop (1);
+
+  return NULL;
+}
+
+static void
+poll_threads (int nthreads)
+{
+  pthread_t thr[nthreads];
+  for (int i = 0; i < nthreads; i++)
+    thr[i] = xpthread_create (NULL, tf, NULL);
+  for (int i = 0; i < nthreads; i++)
+    xpthread_cancel (thr[i]);
+  for (int i = 0; i < nthreads; i++)
+    xpthread_join (thr[i]);
+}
+
+static int
+do_test (void)
+{
+  for (int k = 0; k < NITER; k++)
+    poll_threads (NTHREADS);
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/pthread/tst-stack2-mod.c b/sysdeps/pthread/tst-stack2-mod.c
new file mode 100644
index 000000000..806fdbcd8
--- /dev/null
+++ b/sysdeps/pthread/tst-stack2-mod.c
@@ -0,0 +1,39 @@
+/* Check if pthread_getattr_np works within modules with non-executable
+   stacks (BZ 32897).
+   Copyright (C) 2025 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <pthread.h>
+
+bool init_result;
+
+void
+__attribute__ ((constructor))
+init (void)
+{
+  pthread_t me = pthread_self ();
+  pthread_attr_t attr;
+  init_result = pthread_getattr_np (me, &attr) == 0;
+}
+
+int
+mod_func (void)
+{
+  pthread_t me = pthread_self ();
+  pthread_attr_t attr;
+  return pthread_getattr_np (me, &attr);
+}
diff --git a/sysdeps/pthread/tst-stack2.c b/sysdeps/pthread/tst-stack2.c
new file mode 100644
index 000000000..20ab5af16
--- /dev/null
+++ b/sysdeps/pthread/tst-stack2.c
@@ -0,0 +1,47 @@
+/* Check if pthread_getattr_np works within modules with non-executable
+   stacks (BZ 32897).
+   Copyright (C) 2025 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <pthread.h>
+#include <stdbool.h>
+#include <support/xdlfcn.h>
+#include <support/check.h>
+
+static int
+do_test (void)
+{
+  {
+    pthread_t me = pthread_self ();
+    pthread_attr_t attr;
+    TEST_COMPARE (pthread_getattr_np (me, &attr), 0);
+  }
+
+  void *h = xdlopen ("tst-stack2-mod.so", RTLD_NOW);
+
+  bool *init_result = xdlsym (h, "init_result");
+  TEST_COMPARE (*init_result, true);
+
+  int (*mod_func)(void) = xdlsym (h, "mod_func");
+  TEST_COMPARE (mod_func (), 0);
+
+  xdlclose (h);
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/riscv/dl-machine.h b/sysdeps/riscv/dl-machine.h
index a30892f08..dcc3e0883 100644
--- a/sysdeps/riscv/dl-machine.h
+++ b/sysdeps/riscv/dl-machine.h
@@ -348,7 +348,8 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
       gotplt[1] = (ElfW(Addr)) l;
     }
 
-  if (l->l_type == lt_executable && l->l_relocated)
+#ifdef SHARED
+  if (l->l_type == lt_executable)
     {
       /* The __global_pointer$ may not be defined by the linker if the
 	 $gp register does not be used to access the global variable
@@ -362,12 +363,16 @@ elf_machine_runtime_setup (struct link_map *l, struct r_scope_elem *scope[],
       _dl_lookup_symbol_x ("__global_pointer$", l, &ref,
 			   l->l_scope, NULL, 0, 0, NULL);
       if (ref)
-        asm (
-          "mv gp, %0\n"
-          :
-          : "r" (ref->st_value)
-        );
+	asm (
+	  "mv gp, %0\n"
+	  :
+	  : "r" (ref->st_value + l->l_addr)
+	  /* Don't use SYMBOL_ADDRESS here since __global_pointer$
+	     can be SHN_ABS type, but we need the address relative to
+	     PC, not the absolute address.  */
+	);
     }
+#endif
 #endif
   return lazy;
 }
diff --git a/sysdeps/sparc/sparc32/start.S b/sysdeps/sparc/sparc32/start.S
index 694b020ce..8393760da 100644
--- a/sysdeps/sparc/sparc32/start.S
+++ b/sysdeps/sparc/sparc32/start.S
@@ -35,6 +35,7 @@
 
 #include <sysdep.h>
 
+#define FRAME_SIZE 104
 
 	.section ".text"
 	.align 4
@@ -48,12 +49,12 @@ _start:
   /* Terminate the stack frame, and reserve space for functions to
      drop their arguments.  */
 	mov	%g0, %fp
-	sub	%sp, 6*4, %sp
+	sub	%sp, FRAME_SIZE, %sp
 
   /* Extract the arguments and environment as encoded on the stack.  The
      argument info starts after one register window (16 words) past the SP.  */
-	ld	[%sp+22*4], %o1
-	add	%sp, 23*4, %o2
+	ld	[%sp+168], %o1
+	add	%sp, 172, %o2
 
   /* Load the addresses of the user entry points.  */
 #ifndef PIC
@@ -73,6 +74,10 @@ _start:
      be NULL.  */
 	mov	%g1, %o5
 
+  /* Provide the highest stack address to update the __libc_stack_end (used
+     to enable executable stacks if required).  */
+	st	%sp, [%sp+23*4]
+
   /* Let libc do the rest of the initialization, and call main.  */
 	call	__libc_start_main
 	 nop
diff --git a/sysdeps/sparc/sparc64/start.S b/sysdeps/sparc/sparc64/start.S
index c9c25c2e4..08e1e7721 100644
--- a/sysdeps/sparc/sparc64/start.S
+++ b/sysdeps/sparc/sparc64/start.S
@@ -74,6 +74,10 @@ _start:
      be NULL.  */
 	mov     %g1, %o5
 
+  /* Provide the highest stack address to update the __libc_stack_end (used
+     to enable executable stacks if required).  */
+	stx	%sp, [%sp+STACK_BIAS+22*8]
+
   /* Let libc do the rest of the initialization, and call main.  */
 	call    __libc_start_main
 	 nop
diff --git a/sysdeps/unix/sysv/linux/aarch64/Makefile b/sysdeps/unix/sysv/linux/aarch64/Makefile
index 1fdad67fa..0839f0b08 100644
--- a/sysdeps/unix/sysv/linux/aarch64/Makefile
+++ b/sysdeps/unix/sysv/linux/aarch64/Makefile
@@ -3,7 +3,134 @@ sysdep_headers += sys/elf.h
 tests += \
   tst-aarch64-pkey \
   # tests
-endif
+
+ifneq (no,$(findstring no,$(have-cc-gcs) $(have-test-cc-gcs) $(have-ld-gcs)))
+
+gcs-tests-dynamic = \
+  tst-gcs-disabled \
+  tst-gcs-dlopen-disabled \
+  tst-gcs-dlopen-enforced \
+  tst-gcs-dlopen-optional-off \
+  tst-gcs-dlopen-optional-on \
+  tst-gcs-dlopen-override \
+  tst-gcs-enforced \
+  tst-gcs-enforced-abort \
+  tst-gcs-noreturn \
+  tst-gcs-optional-off \
+  tst-gcs-optional-on \
+  tst-gcs-override \
+  tst-gcs-shared-disabled \
+  tst-gcs-shared-enforced-abort \
+  tst-gcs-shared-optional \
+  tst-gcs-shared-override \
+  # gcs-tests-dynamic
+
+gcs-tests-static = \
+  tst-gcs-disabled-static \
+  tst-gcs-enforced-static \
+  tst-gcs-enforced-static-abort \
+  tst-gcs-optional-static-off \
+  tst-gcs-optional-static-on \
+  tst-gcs-override-static \
+  # gcs-tests-static
+
+tests += \
+  $(gcs-tests-dynamic) \
+  $(gcs-tests-static) \
+  # tests
+
+tests-static += \
+  $(gcs-tests-static) \
+  # tests-static
+
+define run-gcs-abort-test
+  $(test-wrapper-env) $(run-program-env) \
+  $(tst-gcs-$*-abort-ENV) $(host-test-program-cmd)
+endef
+
+$(objpfx)tst-gcs-%-abort.out: $(..)sysdeps/unix/sysv/linux/aarch64/tst-gcs-abort.sh \
+	$(objpfx)tst-gcs-%-abort
+	$(SHELL) $< $(common-objpfx) $(test-name) '$(run-gcs-abort-test)'; \
+	$(evaluate-test)
+
+LDFLAGS-tst-gcs-disabled += -Wl,-z gcs=always
+LDFLAGS-tst-gcs-enforced += -Wl,-z gcs=always
+LDFLAGS-tst-gcs-enforced-abort += -Wl,-z gcs=never
+LDFLAGS-tst-gcs-optional-on += -Wl,-z gcs=always
+LDFLAGS-tst-gcs-optional-off += -Wl,-z gcs=never
+LDFLAGS-tst-gcs-override += -Wl,-z gcs=never
+
+LDFLAGS-tst-gcs-disabled-static += -Wl,-z gcs=always
+LDFLAGS-tst-gcs-enforced-static += -Wl,-z gcs=always
+LDFLAGS-tst-gcs-enforced-static-abort += -Wl,-z gcs=never
+LDFLAGS-tst-gcs-optional-static-on += -Wl,-z gcs=always
+LDFLAGS-tst-gcs-optional-static-off += -Wl,-z gcs=never
+LDFLAGS-tst-gcs-override-static += -Wl,-z gcs=never
+
+tst-gcs-disabled-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=0
+tst-gcs-enforced-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=1
+tst-gcs-enforced-abort-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=1
+tst-gcs-optional-on-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=2
+tst-gcs-optional-off-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=2
+tst-gcs-override-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=3
+
+tst-gcs-disabled-static-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=0
+tst-gcs-enforced-static-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=1
+tst-gcs-enforced-static-abort-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=1
+tst-gcs-optional-static-on-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=2
+tst-gcs-optional-static-off-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=2
+tst-gcs-override-static-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=3
+
+# force one of the dependencies to be unmarked
+LDFLAGS-tst-gcs-mod2.so += -Wl,-z gcs=never
+
+LDFLAGS-tst-gcs-shared-disabled = -Wl,-z gcs=always
+LDFLAGS-tst-gcs-shared-enforced-abort = -Wl,-z gcs=always
+LDFLAGS-tst-gcs-shared-optional = -Wl,-z gcs=always
+LDFLAGS-tst-gcs-shared-override = -Wl,-z gcs=always
+
+modules-names += \
+  tst-gcs-mod1 \
+  tst-gcs-mod2 \
+  tst-gcs-mod3 \
+  # modules-names
+
+$(objpfx)tst-gcs-shared-disabled: $(objpfx)tst-gcs-mod1.so $(objpfx)tst-gcs-mod3.so
+$(objpfx)tst-gcs-shared-enforced-abort: $(objpfx)tst-gcs-mod1.so $(objpfx)tst-gcs-mod3.so
+$(objpfx)tst-gcs-shared-optional: $(objpfx)tst-gcs-mod1.so $(objpfx)tst-gcs-mod3.so
+$(objpfx)tst-gcs-shared-override: $(objpfx)tst-gcs-mod1.so $(objpfx)tst-gcs-mod3.so
+$(objpfx)tst-gcs-mod1.so: $(objpfx)tst-gcs-mod2.so
+
+tst-gcs-shared-disabled-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=0
+tst-gcs-shared-enforced-abort-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=1
+tst-gcs-shared-optional-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=2
+tst-gcs-shared-override-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=3
+
+LDFLAGS-tst-gcs-dlopen-disabled = -Wl,-z gcs=always
+LDFLAGS-tst-gcs-dlopen-enforced = -Wl,-z gcs=always
+LDFLAGS-tst-gcs-dlopen-optional-on = -Wl,-z gcs=always
+LDFLAGS-tst-gcs-dlopen-optional-off = -Wl,-z gcs=never
+LDFLAGS-tst-gcs-dlopen-override = -Wl,-z gcs=always
+
+tst-gcs-dlopen-disabled-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=0
+tst-gcs-dlopen-enforced-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=1
+tst-gcs-dlopen-optional-on-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=2
+tst-gcs-dlopen-optional-off-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=2
+tst-gcs-dlopen-override-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=3
+
+$(objpfx)tst-gcs-dlopen-disabled.out: $(objpfx)tst-gcs-mod2.so
+$(objpfx)tst-gcs-dlopen-enforced.out: $(objpfx)tst-gcs-mod2.so
+$(objpfx)tst-gcs-dlopen-optional-on.out: $(objpfx)tst-gcs-mod2.so
+$(objpfx)tst-gcs-dlopen-optional-off.out: $(objpfx)tst-gcs-mod2.so
+$(objpfx)tst-gcs-dlopen-override.out: $(objpfx)tst-gcs-mod2.so
+
+LDFLAGS-tst-gcs-noreturn = -Wl,-z gcs=always
+
+tst-gcs-noreturn-ENV = GLIBC_TUNABLES=glibc.cpu.aarch64_gcs=0
+
+endif # ifneq (no,$(findstring no,$(have-cc-gcs) $(have-test-cc-gcs) $(have-ld-gcs)))
+
+endif # ifeq ($(subdir),misc)
 
 ifeq ($(subdir),stdlib)
 gen-as-const-headers += ucontext_i.sym
diff --git a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
index 6d63c8a9e..1acc82d07 100644
--- a/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
+++ b/sysdeps/unix/sysv/linux/aarch64/cpu-features.c
@@ -23,6 +23,7 @@
 #include <sys/prctl.h>
 #include <sys/utsname.h>
 #include <dl-tunables-parse.h>
+#include <dl-symbol-redir-ifunc.h>
 
 #define DCZID_DZP_MASK (1 << 4)
 #define DCZID_BS_MASK (0xf)
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-aarch64-pkey.c b/sysdeps/unix/sysv/linux/aarch64/tst-aarch64-pkey.c
index 3ff33ef72..c884efc3b 100644
--- a/sysdeps/unix/sysv/linux/aarch64/tst-aarch64-pkey.c
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-aarch64-pkey.c
@@ -55,6 +55,10 @@ do_test (void)
       if (errno == ENOSYS || errno == EINVAL)
         FAIL_UNSUPPORTED
           ("kernel or CPU does not support memory protection keys");
+      if (errno == ENOSPC)
+        FAIL_UNSUPPORTED
+          ("no keys available or kernel does not support memory"
+           " protection keys");
       FAIL_EXIT1 ("pkey_alloc: %m");
     }
 
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-abort.sh b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-abort.sh
new file mode 100644
index 000000000..9e2be2d5c
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-abort.sh
@@ -0,0 +1,39 @@
+#!/bin/sh
+# Test wrapper for AArch64 tests for GCS that are expected to abort.
+# Copyright (C) 2025 Free Software Foundation, Inc.
+# This file is part of the GNU C Library.
+
+# The GNU C Library is free software; you can redistribute it and/or
+# modify it under the terms of the GNU Lesser General Public
+# License as published by the Free Software Foundation; either
+# version 2.1 of the License, or (at your option) any later version.
+
+# The GNU C Library is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+# Lesser General Public License for more details.
+
+# You should have received a copy of the GNU Lesser General Public
+# License along with the GNU C Library; if not, see
+# <https://www.gnu.org/licenses/>.
+
+objpfx=$1; shift
+tstname=$1; shift
+tstrun=$1; shift
+
+logfile="$objpfx/$tstname.out"
+
+rm -vf "$logfile"
+touch "$logfile"
+# Leave $tstrun unquoted: it may hold a command with arguments to be split.
+${tstrun} >> "$logfile" 2>&1; status=$?
+
+if test $status -eq 127 \
+  && grep -q -w "not GCS compatible" "$logfile" ; then
+  exit 0
+elif test $status -eq 77; then
+  exit 77
+else
+  echo "unexpected test output or exit status $status"
+  exit 1
+fi
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-disabled-static.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-disabled-static.c
new file mode 100644
index 000000000..c71d68cb8
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-disabled-static.c
@@ -0,0 +1 @@
+#include "tst-gcs-disabled.c"
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-disabled.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-disabled.c
new file mode 100644
index 000000000..bd688785b
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-disabled.c
@@ -0,0 +1,2 @@
+#define TEST_GCS_EXPECT_ENABLED 0
+#include "tst-gcs-skeleton.c"
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-disabled.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-disabled.c
new file mode 100644
index 000000000..34395280a
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-disabled.c
@@ -0,0 +1,3 @@
+#define TEST_GCS_EXPECT_ENABLED 0
+#define TEST_GCS_EXPECT_DLOPEN 1
+#include "tst-gcs-dlopen.c"
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-enforced.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-enforced.c
new file mode 100644
index 000000000..d8489ecd2
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-enforced.c
@@ -0,0 +1,3 @@
+#define TEST_GCS_EXPECT_ENABLED 1
+#define TEST_GCS_EXPECT_DLOPEN 0
+#include "tst-gcs-dlopen.c"
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-optional-off.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-optional-off.c
new file mode 100644
index 000000000..34395280a
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-optional-off.c
@@ -0,0 +1,3 @@
+#define TEST_GCS_EXPECT_ENABLED 0
+#define TEST_GCS_EXPECT_DLOPEN 1
+#include "tst-gcs-dlopen.c"
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-optional-on.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-optional-on.c
new file mode 100644
index 000000000..d8489ecd2
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-optional-on.c
@@ -0,0 +1,3 @@
+#define TEST_GCS_EXPECT_ENABLED 1
+#define TEST_GCS_EXPECT_DLOPEN 0
+#include "tst-gcs-dlopen.c"
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-override.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-override.c
new file mode 100644
index 000000000..152ffcf20
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen-override.c
@@ -0,0 +1,3 @@
+#define TEST_GCS_EXPECT_ENABLED 1
+#define TEST_GCS_EXPECT_DLOPEN 1
+#include "tst-gcs-dlopen.c"
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen.c
new file mode 100644
index 000000000..6e0801c63
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-dlopen.c
@@ -0,0 +1,62 @@
+/* AArch64 tests for GCS for dlopen use case.
+   Copyright (C) 2025 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "tst-gcs-helper.h"
+
+#include <dlfcn.h>
+#include <string.h>
+
+static int
+do_test (void)
+{
+  /* Check if GCS could possibly be enabled.  */
+  if (!(getauxval (AT_HWCAP) & HWCAP_GCS))
+    {
+      puts ("kernel or CPU does not support GCS");
+      return EXIT_UNSUPPORTED;
+    }
+  /* The tst-gcs-mod2.so test library does not have GCS marking.  */
+  void *h = dlopen ("tst-gcs-mod2.so", RTLD_NOW);
+  const char *err = dlerror ();
+
+#if TEST_GCS_EXPECT_DLOPEN
+  TEST_VERIFY (h != NULL);
+#else
+  TEST_VERIFY (h == NULL);
+  /* Only accept the GCS error; err is NULL if dlopen wrongly succeeded.  */
+  TEST_VERIFY (err != NULL && strstr (err, "not GCS compatible") != NULL);
+#endif
+
+#if TEST_GCS_EXPECT_ENABLED
+  TEST_VERIFY (__check_gcs_status ());
+#else
+  TEST_VERIFY (!__check_gcs_status ());
+#endif
+
+  if (h == NULL)
+    printf ("dlopen error: %s\n", err);
+  else
+    {
+      puts ("library loaded normally");
+      dlclose (h);
+    }
+
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced-abort.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced-abort.c
new file mode 100644
index 000000000..608318f26
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced-abort.c
@@ -0,0 +1,2 @@
+#define TEST_GCS_EXPECT_ENABLED 1
+#include "tst-gcs-skeleton.c"
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced-static-abort.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced-static-abort.c
new file mode 100644
index 000000000..c20a999f6
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced-static-abort.c
@@ -0,0 +1 @@
+#include "tst-gcs-enforced-abort.c"
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced-static.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced-static.c
new file mode 100644
index 000000000..bb39dada5
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced-static.c
@@ -0,0 +1 @@
+#include "tst-gcs-enforced.c"
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced.c
new file mode 100644
index 000000000..608318f26
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-enforced.c
@@ -0,0 +1,2 @@
+#define TEST_GCS_EXPECT_ENABLED 1
+#include "tst-gcs-skeleton.c"
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-helper.h b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-helper.h
new file mode 100644
index 000000000..d8a586d2d
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-helper.h
@@ -0,0 +1,39 @@
+/* AArch64 tests for GCS.
+   Copyright (C) 2025 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#ifndef TST_GCS_HELPER_H
+#define TST_GCS_HELPER_H
+
+#include <support/check.h>
+#include <support/support.h>
+#include <support/test-driver.h>
+
+#include <stdio.h>
+#include <sys/auxv.h>
+
+static bool __check_gcs_status (void)
+{
+  register unsigned long x16 asm ("x16");
+  asm volatile (
+    "mov	x16, #1 /* _CHKFEAT_GCS */\n"
+    "hint	40 /* CHKFEAT_X16 */\n"
+    : "=r" (x16));
+  return x16 ^ 1; /* True if CHKFEAT cleared the GCS bit, i.e. GCS is on.  */
+}
+
+#endif // TST_GCS_HELPER_H
diff --git a/sysdeps/powerpc/powerpc64/multiarch/strncmp-power10.S b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-mod1.c
similarity index 72%
rename from sysdeps/powerpc/powerpc64/multiarch/strncmp-power10.S
rename to sysdeps/unix/sysv/linux/aarch64/tst-gcs-mod1.c
index 43879085e..931ff8179 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/strncmp-power10.S
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-mod1.c
@@ -1,4 +1,5 @@
-/* Copyright (C) 2024-2025 Free Software Foundation, Inc.
+/* DSO for testing GCS.
+   Copyright (C) 2025 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -15,11 +16,12 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
-#define STRNCMP __strncmp_power10
+#include <stdio.h>
 
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(name)
+int fun2 (void); // tst-gcs-mod2.c
 
-#include <sysdeps/powerpc/powerpc64/le/power10/strncmp.S>
-#endif
+int fun1 (void)
+{
+  puts ("called function fun1");
+  return fun2 ();
+}
diff --git a/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-mod2.c
similarity index 66%
rename from sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S
rename to sysdeps/unix/sysv/linux/aarch64/tst-gcs-mod2.c
index c9d2f4efd..f9370eb8f 100644
--- a/sysdeps/powerpc/powerpc64/multiarch/memchr-power10.S
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-mod2.c
@@ -1,5 +1,5 @@
-/* Optimized memchr implementation for POWER10/PPC64.
-   Copyright (C) 2016-2025 Free Software Foundation, Inc.
+/* DSO for testing GCS.
+   Copyright (C) 2025 Free Software Foundation, Inc.
    This file is part of the GNU C Library.
 
    The GNU C Library is free software; you can redistribute it and/or
@@ -16,13 +16,10 @@
    License along with the GNU C Library; if not, see
    <https://www.gnu.org/licenses/>.  */
 
-#if defined __LITTLE_ENDIAN__ && IS_IN (libc)
-#define MEMCHR __memchr_power10
+#include <stdio.h>
 
-#undef libc_hidden_builtin_def
-#define libc_hidden_builtin_def(name)
-#undef weak_alias
-#define weak_alias(name,alias)
-
-#include <sysdeps/powerpc/powerpc64/le/power10/memchr.S>
-#endif
+int fun2 (void)
+{
+  puts ("called function fun2");
+  return 0;
+}
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-mod3.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-mod3.c
new file mode 100644
index 000000000..38bb35754
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-mod3.c
@@ -0,0 +1,25 @@
+/* DSO for testing GCS.
+   Copyright (C) 2025 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <stdio.h>
+
+int fun3 (void)
+{
+  puts ("called function fun3");
+  return 0;
+}
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-noreturn.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-noreturn.c
new file mode 100644
index 000000000..f55057924
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-noreturn.c
@@ -0,0 +1,101 @@
+/* AArch64 test for GCS abort when returning to non-GCS address.
+   Copyright (C) 2025 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "tst-gcs-helper.h"
+
+#include <sys/prctl.h>
+#include <stdlib.h>
+
+#include <support/xsignal.h>
+
+# ifndef PR_SET_SHADOW_STACK_STATUS
+#  define PR_SET_SHADOW_STACK_STATUS	75
+#  define PR_SHADOW_STACK_ENABLE	(1UL << 0)
+# endif
+
+static void
+run_with_gcs (void)
+{
+  int r = prctl (PR_SET_SHADOW_STACK_STATUS, PR_SHADOW_STACK_ENABLE, 0, 0, 0);
+  /* Syscall should succeed.  */
+  TEST_VERIFY (r == 0);
+  bool gcs_enabled = __check_gcs_status ();
+  /* Now GCS should be enabled.  */
+  TEST_VERIFY (gcs_enabled);
+  printf ("GCS is %s\n", gcs_enabled ? "enabled" : "disabled");
+}
+
+static struct _aarch64_ctx *
+extension (void *p)
+{
+  return p;
+}
+
+#ifndef GCS_MAGIC
+#define GCS_MAGIC 0x47435300
+#endif
+
+static void
+handler (int sig, siginfo_t *si, void *ctx)
+{
+  TEST_VERIFY (sig == SIGSEGV);
+  ucontext_t *uc = ctx;
+  void *p = uc->uc_mcontext.__reserved;
+  if (extension (p)->magic == FPSIMD_MAGIC)
+    p = (char *)p + extension (p)->size; /* Skip the FPSIMD record.  */
+  if (extension (p)->magic == GCS_MAGIC)
+    {
+      struct { uint64_t x, gcspr, y, z; } *q = p;
+      printf ("GCS pointer: %016lx\n", q->gcspr);
+      exit (0); /* Expected outcome: a GCS record is present.  */
+    }
+  else
+    exit (3); /* No GCS record follows; report a distinct failure code.  */
+}
+
+static int
+do_test (void)
+{
+  /* Check if GCS could possibly be enabled.  */
+  if (!(getauxval (AT_HWCAP) & HWCAP_GCS))
+    {
+      puts ("kernel or CPU does not support GCS");
+      return EXIT_UNSUPPORTED;
+    }
+  bool gcs_enabled = __check_gcs_status ();
+  /* This test should be run with GCS initially disabled.  */
+  TEST_VERIFY (!gcs_enabled);
+
+  /* We can't use EXPECTED_SIGNAL because of cases when
+     this test runs on a system that does not support GCS
+     which is being detected at runtime.  */
+  struct sigaction sigact;
+  sigemptyset (&sigact.sa_mask);
+  sigact.sa_flags = 0;
+  sigact.sa_flags = sigact.sa_flags | SA_SIGINFO;
+  sigact.sa_sigaction = handler;
+  xsigaction (SIGSEGV, &sigact, NULL);
+
+  run_with_gcs ();
+  /* If we reached this point, then something went wrong.
+     Returning from a function that enabled GCS should result in
+     SIGSEGV that we catch with the handler set up above.  */
+  return 2;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-off.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-off.c
new file mode 100644
index 000000000..bd688785b
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-off.c
@@ -0,0 +1,2 @@
+#define TEST_GCS_EXPECT_ENABLED 0
+#include "tst-gcs-skeleton.c"
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-on.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-on.c
new file mode 100644
index 000000000..608318f26
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-on.c
@@ -0,0 +1,2 @@
+#define TEST_GCS_EXPECT_ENABLED 1
+#include "tst-gcs-skeleton.c"
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-static-off.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-static-off.c
new file mode 100644
index 000000000..54e3b9a0d
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-static-off.c
@@ -0,0 +1 @@
+#include "tst-gcs-optional-off.c"
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-static-on.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-static-on.c
new file mode 100644
index 000000000..11b884b42
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-optional-static-on.c
@@ -0,0 +1 @@
+#include "tst-gcs-optional-on.c"
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-override-static.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-override-static.c
new file mode 100644
index 000000000..09055dcdc
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-override-static.c
@@ -0,0 +1 @@
+#include "tst-gcs-override.c"
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-override.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-override.c
new file mode 100644
index 000000000..608318f26
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-override.c
@@ -0,0 +1,2 @@
+#define TEST_GCS_EXPECT_ENABLED 1
+#include "tst-gcs-skeleton.c"
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-disabled.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-disabled.c
new file mode 100644
index 000000000..8598dc44b
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-disabled.c
@@ -0,0 +1,2 @@
+#define TEST_GCS_EXPECT_ENABLED 0
+#include "tst-gcs-shared.c"
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-enforced-abort.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-enforced-abort.c
new file mode 100644
index 000000000..f1333cee9
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-enforced-abort.c
@@ -0,0 +1,2 @@
+#define TEST_GCS_EXPECT_ENABLED 1
+#include "tst-gcs-shared.c"
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-optional.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-optional.c
new file mode 100644
index 000000000..8598dc44b
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-optional.c
@@ -0,0 +1,2 @@
+#define TEST_GCS_EXPECT_ENABLED 0
+#include "tst-gcs-shared.c"
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-override.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-override.c
new file mode 100644
index 000000000..f1333cee9
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared-override.c
@@ -0,0 +1,2 @@
+#define TEST_GCS_EXPECT_ENABLED 1
+#include "tst-gcs-shared.c"
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared.c
new file mode 100644
index 000000000..1192de69f
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-shared.c
@@ -0,0 +1,41 @@
+/* AArch64 tests for GCS.
+   Copyright (C) 2025 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "tst-gcs-helper.h"
+
+int fun1 (void); // tst-gcs-mod1.c
+int fun3 (void); // tst-gcs-mod3.c
+
+static int
+do_test (void)
+{
+  /* Check if GCS could possibly be enabled.  */
+  if (!(getauxval (AT_HWCAP) & HWCAP_GCS))
+    {
+      puts ("kernel or CPU does not support GCS");
+      return EXIT_UNSUPPORTED;
+    }
+#if TEST_GCS_EXPECT_ENABLED
+  TEST_VERIFY (__check_gcs_status ());
+#else
+  TEST_VERIFY (!__check_gcs_status ());
+#endif
+  return fun1 () + fun3 ();
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/unix/sysv/linux/aarch64/tst-gcs-skeleton.c b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-skeleton.c
new file mode 100644
index 000000000..feb5e33eb
--- /dev/null
+++ b/sysdeps/unix/sysv/linux/aarch64/tst-gcs-skeleton.c
@@ -0,0 +1,43 @@
+/* AArch64 tests for GCS.
+   Copyright (C) 2025 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include "tst-gcs-helper.h"
+
+static int
+do_test (void)
+{
+  /* Check if GCS could possibly be enabled.  */
+  if (!(getauxval (AT_HWCAP) & HWCAP_GCS))
+    {
+      puts ("kernel or CPU does not support GCS");
+      return EXIT_UNSUPPORTED;
+    }
+  bool gcs_enabled = __check_gcs_status ();
+  if (gcs_enabled)
+    puts ("GCS enabled");
+  else
+    puts ("GCS not enabled");
+#if TEST_GCS_EXPECT_ENABLED
+  TEST_VERIFY (gcs_enabled);
+#else
+  TEST_VERIFY (!gcs_enabled);
+#endif
+  return 0;
+}
+
+#include <support/test-driver.c>
diff --git a/sysdeps/unix/sysv/linux/bits/sched.h b/sysdeps/unix/sysv/linux/bits/sched.h
index 3656e98ed..39b0b3d19 100644
--- a/sysdeps/unix/sysv/linux/bits/sched.h
+++ b/sysdeps/unix/sysv/linux/bits/sched.h
@@ -152,7 +152,7 @@ int sched_setattr (pid_t tid, struct sched_attr *attr, unsigned int flags)
    store it in *ATTR.  */
 int sched_getattr (pid_t tid, struct sched_attr *attr, unsigned int size,
 		   unsigned int flags)
-  __THROW __nonnull ((2)) __attr_access ((__write_only__, 2, 3));
+  __THROW __nonnull ((2));
 
 #endif
 
diff --git a/sysdeps/unix/sysv/linux/dl-execstack.c b/sysdeps/unix/sysv/linux/dl-execstack.c
index 9791b339c..6db960165 100644
--- a/sysdeps/unix/sysv/linux/dl-execstack.c
+++ b/sysdeps/unix/sysv/linux/dl-execstack.c
@@ -19,10 +19,10 @@
 #include <ldsodefs.h>
 
 int
-_dl_make_stack_executable (void **stack_endp)
+_dl_make_stack_executable (const void *stack_endp)
 {
   /* This gives us the highest/lowest page that needs to be changed.  */
-  uintptr_t page = ((uintptr_t) *stack_endp
+  uintptr_t page = ((uintptr_t) stack_endp
 		    & -(intptr_t) GLRO(dl_pagesize));
 
   if (__mprotect ((void *) page, GLRO(dl_pagesize),
@@ -35,9 +35,6 @@ _dl_make_stack_executable (void **stack_endp)
 		  ) != 0)
     return errno;
 
-  /* Clear the address.  */
-  *stack_endp = NULL;
-
   /* Remember that we changed the permission.  */
   GL(dl_stack_flags) |= PF_X;
 
diff --git a/sysdeps/unix/sysv/linux/rseq-internal.h b/sysdeps/unix/sysv/linux/rseq-internal.h
index f89e78424..d2ab4cb82 100644
--- a/sysdeps/unix/sysv/linux/rseq-internal.h
+++ b/sysdeps/unix/sysv/linux/rseq-internal.h
@@ -108,13 +108,12 @@ rseq_register_current_thread (struct pthread *self, bool do_rseq)
       if (size < RSEQ_AREA_SIZE_INITIAL)
         size = RSEQ_AREA_SIZE_INITIAL;
 
-      /* Initialize the rseq fields that are read by the kernel on
-         registration, there is no guarantee that struct pthread is
-         cleared on all architectures.  */
+      /* Initialize the whole rseq area to zero prior to registration.  */
+      memset (RSEQ_SELF (), 0, size);
+
+      /* Set the cpu_id field to RSEQ_CPU_ID_UNINITIALIZED, this is checked by
+         the kernel at registration when CONFIG_DEBUG_RSEQ is enabled.  */
       RSEQ_SETMEM (cpu_id, RSEQ_CPU_ID_UNINITIALIZED);
-      RSEQ_SETMEM (cpu_id_start, 0);
-      RSEQ_SETMEM (rseq_cs, 0);
-      RSEQ_SETMEM (flags, 0);
 
       int ret = INTERNAL_SYSCALL_CALL (rseq, RSEQ_SELF (), size, 0, RSEQ_SIG);
       if (!INTERNAL_SYSCALL_ERROR_P (ret))
diff --git a/sysdeps/x86/Makefile b/sysdeps/x86/Makefile
index 5311b594a..01b0192dd 100644
--- a/sysdeps/x86/Makefile
+++ b/sysdeps/x86/Makefile
@@ -21,6 +21,9 @@ tests += \
   tst-cpu-features-supports-static \
   tst-get-cpu-features \
   tst-get-cpu-features-static \
+  tst-gnu2-tls2-x86-noxsave \
+  tst-gnu2-tls2-x86-noxsavec \
+  tst-gnu2-tls2-x86-noxsavexsavec \
   tst-hwcap-tunables \
 # tests
 tests-static += \
@@ -91,6 +94,25 @@ CFLAGS-tst-gnu2-tls2.c += -msse
 CFLAGS-tst-gnu2-tls2mod0.c += -msse2 -mtune=haswell
 CFLAGS-tst-gnu2-tls2mod1.c += -msse2 -mtune=haswell
 CFLAGS-tst-gnu2-tls2mod2.c += -msse2 -mtune=haswell
+
+LDFLAGS-tst-gnu2-tls2-x86-noxsave += -Wl,-z,lazy
+LDFLAGS-tst-gnu2-tls2-x86-noxsavec += -Wl,-z,lazy
+LDFLAGS-tst-gnu2-tls2-x86-noxsavexsavec += -Wl,-z,lazy
+
+# Test for bug 32810: incorrect XSAVE state size if XSAVEC is disabled
+# via tunable.
+tst-gnu2-tls2-x86-noxsave-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE
+tst-gnu2-tls2-x86-noxsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC
+tst-gnu2-tls2-x86-noxsavexsavec-ENV = GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVE,-XSAVEC
+$(objpfx)tst-gnu2-tls2-x86-noxsave: $(shared-thread-library)
+$(objpfx)tst-gnu2-tls2-x86-noxsavec: $(shared-thread-library)
+$(objpfx)tst-gnu2-tls2-x86-noxsavexsavec: $(shared-thread-library)
+$(objpfx)tst-gnu2-tls2-x86-noxsave.out \
+$(objpfx)tst-gnu2-tls2-x86-noxsavec.out \
+$(objpfx)tst-gnu2-tls2-x86-noxsavexsavec.out: \
+  $(objpfx)tst-gnu2-tls2mod0.so \
+  $(objpfx)tst-gnu2-tls2mod1.so \
+  $(objpfx)tst-gnu2-tls2mod2.so
 endif
 
 ifeq ($(subdir),math)
diff --git a/sysdeps/x86/bits/floatn.h b/sysdeps/x86/bits/floatn.h
index d197cb10d..4674165bd 100644
--- a/sysdeps/x86/bits/floatn.h
+++ b/sysdeps/x86/bits/floatn.h
@@ -25,11 +25,15 @@
    floating-point type with the IEEE 754 binary128 format, and this
    glibc includes corresponding *f128 interfaces for it.  The required
    libgcc support was added some time after the basic compiler
-   support, for x86_64 and x86.  */
+   support, for x86_64 and x86.  Intel SYCL compiler doesn't support
+   _Float128: https://github.com/intel/llvm/issues/16903
+  */
 #if (defined __x86_64__							\
      ? __GNUC_PREREQ (4, 3)						\
      : (defined __GNU__ ? __GNUC_PREREQ (4, 5) : __GNUC_PREREQ (4, 4))) \
-    || __glibc_clang_prereq (3, 4)
+    || (__glibc_clang_prereq (3, 9)					\
+	&& (!defined __INTEL_LLVM_COMPILER				\
+	    || !defined SYCL_LANGUAGE_VERSION))
 # define __HAVE_FLOAT128 1
 #else
 # define __HAVE_FLOAT128 0
@@ -89,7 +93,7 @@ typedef _Complex float __cfloat128 __attribute__ ((__mode__ (__TC__)));
 /* The type _Float128 exists only since GCC 7.0.  */
 #  if !__GNUC_PREREQ (7, 0) \
       || (defined __cplusplus && !__GNUC_PREREQ (13, 0)) \
-      || __glibc_clang_prereq (3, 4)
+      || __glibc_clang_prereq (3, 9)
 typedef __float128 _Float128;
 #  endif
 
diff --git a/sysdeps/x86/cpu-features.c b/sysdeps/x86/cpu-features.c
index 27abaca8b..e50f1d693 100644
--- a/sysdeps/x86/cpu-features.c
+++ b/sysdeps/x86/cpu-features.c
@@ -24,6 +24,7 @@
 #include <dl-cacheinfo.h>
 #include <dl-minsigstacksize.h>
 #include <dl-hwcap2.h>
+#include <gcc-macros.h>
 
 extern void TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *)
   attribute_hidden;
@@ -83,6 +84,8 @@ extern void TUNABLE_CALLBACK (set_x86_shstk) (tunable_val_t *)
 # include <dl-cet.h>
 #endif
 
+unsigned long int _dl_x86_features_tlsdesc_state_size;
+
 static void
 update_active (struct cpu_features *cpu_features)
 {
@@ -317,17 +320,13 @@ update_active (struct cpu_features *cpu_features)
 		= xsave_state_full_size;
 	      cpu_features->xsave_state_full_size
 		= xsave_state_full_size;
+	      _dl_x86_features_tlsdesc_state_size = xsave_state_full_size;
 
 	      /* Check if XSAVEC is available.  */
 	      if (CPU_FEATURES_CPU_P (cpu_features, XSAVEC))
 		{
-		  unsigned int xstate_comp_offsets[32];
-		  unsigned int xstate_comp_sizes[32];
-#ifdef __x86_64__
-		  unsigned int xstate_amx_comp_offsets[32];
-		  unsigned int xstate_amx_comp_sizes[32];
-		  unsigned int amx_ecx;
-#endif
+		  unsigned int xstate_comp_offsets[X86_XSTATE_MAX_ID + 1];
+		  unsigned int xstate_comp_sizes[X86_XSTATE_MAX_ID + 1];
 		  unsigned int i;
 
 		  xstate_comp_offsets[0] = 0;
@@ -335,39 +334,16 @@ update_active (struct cpu_features *cpu_features)
 		  xstate_comp_offsets[2] = 576;
 		  xstate_comp_sizes[0] = 160;
 		  xstate_comp_sizes[1] = 256;
-#ifdef __x86_64__
-		  xstate_amx_comp_offsets[0] = 0;
-		  xstate_amx_comp_offsets[1] = 160;
-		  xstate_amx_comp_offsets[2] = 576;
-		  xstate_amx_comp_sizes[0] = 160;
-		  xstate_amx_comp_sizes[1] = 256;
-#endif
 
-		  for (i = 2; i < 32; i++)
+		  for (i = 2; i <= X86_XSTATE_MAX_ID; i++)
 		    {
 		      if ((FULL_STATE_SAVE_MASK & (1 << i)) != 0)
 			{
 			  __cpuid_count (0xd, i, eax, ebx, ecx, edx);
-#ifdef __x86_64__
-			  /* Include this in xsave_state_full_size.  */
-			  amx_ecx = ecx;
-			  xstate_amx_comp_sizes[i] = eax;
-			  if ((AMX_STATE_SAVE_MASK & (1 << i)) != 0)
-			    {
-			      /* Exclude this from xsave_state_size.  */
-			      ecx = 0;
-			      xstate_comp_sizes[i] = 0;
-			    }
-			  else
-#endif
-			    xstate_comp_sizes[i] = eax;
+			  xstate_comp_sizes[i] = eax;
 			}
 		      else
 			{
-#ifdef __x86_64__
-			  amx_ecx = 0;
-			  xstate_amx_comp_sizes[i] = 0;
-#endif
 			  ecx = 0;
 			  xstate_comp_sizes[i] = 0;
 			}
@@ -376,44 +352,32 @@ update_active (struct cpu_features *cpu_features)
 			{
 			  xstate_comp_offsets[i]
 			    = (xstate_comp_offsets[i - 1]
-			       + xstate_comp_sizes[i -1]);
+			       + xstate_comp_sizes[i - 1]);
 			  if ((ecx & (1 << 1)) != 0)
 			    xstate_comp_offsets[i]
 			      = ALIGN_UP (xstate_comp_offsets[i], 64);
-#ifdef __x86_64__
-			  xstate_amx_comp_offsets[i]
-			    = (xstate_amx_comp_offsets[i - 1]
-			       + xstate_amx_comp_sizes[i - 1]);
-			  if ((amx_ecx & (1 << 1)) != 0)
-			    xstate_amx_comp_offsets[i]
-			      = ALIGN_UP (xstate_amx_comp_offsets[i],
-					  64);
-#endif
 			}
 		    }
 
 		  /* Use XSAVEC.  */
 		  unsigned int size
-		    = xstate_comp_offsets[31] + xstate_comp_sizes[31];
+		    = (xstate_comp_offsets[X86_XSTATE_MAX_ID]
+		       + xstate_comp_sizes[X86_XSTATE_MAX_ID]);
 		  if (size)
 		    {
+		      size = ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA,
+				       64);
 #ifdef __x86_64__
-		      unsigned int amx_size
-			= (xstate_amx_comp_offsets[31]
-			   + xstate_amx_comp_sizes[31]);
-		      amx_size
-			= ALIGN_UP ((amx_size
-				     + TLSDESC_CALL_REGISTER_SAVE_AREA),
-				    64);
-		      /* Set xsave_state_full_size to the compact AMX
-			 state size for XSAVEC.  NB: xsave_state_full_size
-			 is only used in _dl_tlsdesc_dynamic_xsave and
-			 _dl_tlsdesc_dynamic_xsavec.  */
-		      cpu_features->xsave_state_full_size = amx_size;
+		      _dl_x86_features_tlsdesc_state_size = size;
+		      /* Exclude the AMX space from the start of TILECFG
+			 space to the end of TILEDATA space.  If CPU
+			 doesn't support AMX, TILECFG offset is the same
+			 as TILEDATA + 1 offset.  Otherwise, they are
+			 multiples of 64.  */
+		      size -= (xstate_comp_offsets[X86_XSTATE_TILEDATA_ID + 1]
+			       - xstate_comp_offsets[X86_XSTATE_TILECFG_ID]);
 #endif
-		      cpu_features->xsave_state_size
-			= ALIGN_UP (size + TLSDESC_CALL_REGISTER_SAVE_AREA,
-				    64);
+		      cpu_features->xsave_state_size = size;
 		      CPU_FEATURE_SET (cpu_features, XSAVEC);
 		    }
 		}
@@ -538,8 +502,8 @@ _Static_assert (((index_arch_Fast_Unaligned_Load
 		"Incorrect index_arch_Fast_Unaligned_Load");
 
 
-/* Intel Family-6 microarch list.  */
-enum
+/* Intel microarch list.  */
+enum intel_microarch
 {
   /* Atom processors.  */
   INTEL_ATOM_BONNELL,
@@ -548,6 +512,7 @@ enum
   INTEL_ATOM_GOLDMONT,
   INTEL_ATOM_GOLDMONT_PLUS,
   INTEL_ATOM_SIERRAFOREST,
+  INTEL_ATOM_CLEARWATERFOREST,
   INTEL_ATOM_GRANDRIDGE,
   INTEL_ATOM_TREMONT,
 
@@ -575,7 +540,9 @@ enum
   INTEL_BIGCORE_METEORLAKE,
   INTEL_BIGCORE_LUNARLAKE,
   INTEL_BIGCORE_ARROWLAKE,
+  INTEL_BIGCORE_PANTHERLAKE,
   INTEL_BIGCORE_GRANITERAPIDS,
+  INTEL_BIGCORE_DIAMONDRAPIDS,
 
   /* Mixed (bigcore + atom SOC).  */
   INTEL_MIXED_LAKEFIELD,
@@ -589,7 +556,7 @@ enum
   INTEL_UNKNOWN,
 };
 
-static unsigned int
+static enum intel_microarch
 intel_get_fam6_microarch (unsigned int model,
 			  __attribute__ ((unused)) unsigned int stepping)
 {
@@ -620,6 +587,8 @@ intel_get_fam6_microarch (unsigned int model,
       return INTEL_ATOM_GOLDMONT_PLUS;
     case 0xAF:
       return INTEL_ATOM_SIERRAFOREST;
+    case 0xDD:
+      return INTEL_ATOM_CLEARWATERFOREST;
     case 0xB6:
       return INTEL_ATOM_GRANDRIDGE;
     case 0x86:
@@ -727,8 +696,12 @@ intel_get_fam6_microarch (unsigned int model,
       return INTEL_BIGCORE_METEORLAKE;
     case 0xbd:
       return INTEL_BIGCORE_LUNARLAKE;
+    case 0xb5:
+    case 0xc5:
     case 0xc6:
       return INTEL_BIGCORE_ARROWLAKE;
+    case 0xCC:
+      return INTEL_BIGCORE_PANTHERLAKE;
     case 0xAD:
     case 0xAE:
       return INTEL_BIGCORE_GRANITERAPIDS;
@@ -792,133 +765,20 @@ init_cpu_features (struct cpu_features *cpu_features)
       cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
 	  &= ~bit_arch_Avoid_Non_Temporal_Memset;
 
+      enum intel_microarch microarch = INTEL_UNKNOWN;
       if (family == 0x06)
 	{
 	  model += extended_model;
-	  unsigned int microarch
-	      = intel_get_fam6_microarch (model, stepping);
+	  microarch = intel_get_fam6_microarch (model, stepping);
 
+	  /* Disable TSX on some processors to avoid TSX on kernels that
+	     weren't updated with the latest microcode package (which
+	     disables the broken feature by default).  */
 	  switch (microarch)
 	    {
-	      /* Atom / KNL tuning.  */
-	    case INTEL_ATOM_BONNELL:
-	      /* BSF is slow on Bonnell.  */
-	      cpu_features->preferred[index_arch_Slow_BSF]
-		  |= bit_arch_Slow_BSF;
-	      break;
-
-	      /* Unaligned load versions are faster than SSSE3
-		     on Airmont, Silvermont, Goldmont, and Goldmont Plus.  */
-	    case INTEL_ATOM_AIRMONT:
-	    case INTEL_ATOM_SILVERMONT:
-	    case INTEL_ATOM_GOLDMONT:
-	    case INTEL_ATOM_GOLDMONT_PLUS:
-
-          /* Knights Landing.  Enable Silvermont optimizations.  */
-	    case INTEL_KNIGHTS_LANDING:
-
-	      cpu_features->preferred[index_arch_Fast_Unaligned_Load]
-		  |= (bit_arch_Fast_Unaligned_Load
-		      | bit_arch_Fast_Unaligned_Copy
-		      | bit_arch_Prefer_PMINUB_for_stringop
-		      | bit_arch_Slow_SSE4_2);
-	      break;
-
-	    case INTEL_ATOM_TREMONT:
-	      /* Enable rep string instructions, unaligned load, unaligned
-		 copy, pminub and avoid SSE 4.2 on Tremont.  */
-	      cpu_features->preferred[index_arch_Fast_Rep_String]
-		  |= (bit_arch_Fast_Rep_String
-		      | bit_arch_Fast_Unaligned_Load
-		      | bit_arch_Fast_Unaligned_Copy
-		      | bit_arch_Prefer_PMINUB_for_stringop
-		      | bit_arch_Slow_SSE4_2);
-	      break;
-
-	   /*
-	    Default tuned Knights microarch.
-	    case INTEL_KNIGHTS_MILL:
-        */
-
-	   /*
-	    Default tuned atom microarch.
-	    case INTEL_ATOM_SIERRAFOREST:
-	    case INTEL_ATOM_GRANDRIDGE:
-	   */
-
-	      /* Bigcore/Default Tuning.  */
 	    default:
-	    default_tuning:
-	      /* Unknown family 0x06 processors.  Assuming this is one
-		 of Core i3/i5/i7 processors if AVX is available.  */
-	      if (!CPU_FEATURES_CPU_P (cpu_features, AVX))
-		break;
-
-	    enable_modern_features:
-	      /* Rep string instructions, unaligned load, unaligned copy,
-		 and pminub are fast on Intel Core i3, i5 and i7.  */
-	      cpu_features->preferred[index_arch_Fast_Rep_String]
-		  |= (bit_arch_Fast_Rep_String
-		      | bit_arch_Fast_Unaligned_Load
-		      | bit_arch_Fast_Unaligned_Copy
-		      | bit_arch_Prefer_PMINUB_for_stringop);
 	      break;
 
-	    case INTEL_BIGCORE_NEHALEM:
-	    case INTEL_BIGCORE_WESTMERE:
-	      /* Older CPUs prefer non-temporal stores at lower threshold.  */
-	      cpu_features->cachesize_non_temporal_divisor = 8;
-	      goto enable_modern_features;
-
-	      /* Older Bigcore microarch (smaller non-temporal store
-		 threshold).  */
-	    case INTEL_BIGCORE_SANDYBRIDGE:
-	    case INTEL_BIGCORE_IVYBRIDGE:
-	    case INTEL_BIGCORE_HASWELL:
-	    case INTEL_BIGCORE_BROADWELL:
-	      cpu_features->cachesize_non_temporal_divisor = 8;
-	      goto default_tuning;
-
-	      /* Newer Bigcore microarch (larger non-temporal store
-		 threshold).  */
-	    case INTEL_BIGCORE_SKYLAKE_AVX512:
-	    case INTEL_BIGCORE_CANNONLAKE:
-	      /* Benchmarks indicate non-temporal memset is not
-		     necessarily profitable on SKX (and in some cases much
-		     worse). This is likely unique to SKX due its it unique
-		     mesh interconnect (not present on ICX or BWD). Disable
-		     non-temporal on all Skylake servers. */
-	      cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
-		  |= bit_arch_Avoid_Non_Temporal_Memset;
-	      /* fallthrough */
-	    case INTEL_BIGCORE_COMETLAKE:
-	    case INTEL_BIGCORE_SKYLAKE:
-	    case INTEL_BIGCORE_KABYLAKE:
-	    case INTEL_BIGCORE_ICELAKE:
-	    case INTEL_BIGCORE_TIGERLAKE:
-	    case INTEL_BIGCORE_ROCKETLAKE:
-	    case INTEL_BIGCORE_RAPTORLAKE:
-	    case INTEL_BIGCORE_METEORLAKE:
-	    case INTEL_BIGCORE_LUNARLAKE:
-	    case INTEL_BIGCORE_ARROWLAKE:
-	    case INTEL_BIGCORE_SAPPHIRERAPIDS:
-	    case INTEL_BIGCORE_EMERALDRAPIDS:
-	    case INTEL_BIGCORE_GRANITERAPIDS:
-	      cpu_features->cachesize_non_temporal_divisor = 2;
-	      goto default_tuning;
-
-	      /* Default tuned Mixed (bigcore + atom SOC). */
-	    case INTEL_MIXED_LAKEFIELD:
-	    case INTEL_MIXED_ALDERLAKE:
-	      cpu_features->cachesize_non_temporal_divisor = 2;
-	      goto default_tuning;
-	    }
-
-	      /* Disable TSX on some processors to avoid TSX on kernels that
-		 weren't updated with the latest microcode package (which
-		 disables broken feature by default).  */
-	  switch (microarch)
-	    {
 	    case INTEL_BIGCORE_SKYLAKE_AVX512:
 	      /* 0x55 (Skylake-avx512) && stepping <= 5 disable TSX. */
 	      if (stepping <= 5)
@@ -927,38 +787,163 @@ init_cpu_features (struct cpu_features *cpu_features)
 
 	    case INTEL_BIGCORE_KABYLAKE:
 	      /* NB: Although the errata documents that for model == 0x8e
-		     (kabylake skylake client), only 0xb stepping or lower are
-		     impacted, the intention of the errata was to disable TSX on
-		     all client processors on all steppings.  Include 0xc
-		     stepping which is an Intel Core i7-8665U, a client mobile
-		     processor.  */
+		 (kabylake skylake client), only 0xb stepping or lower are
+		 impacted, the intention of the errata was to disable TSX on
+		 all client processors on all steppings.  Include 0xc
+		 stepping which is an Intel Core i7-8665U, a client mobile
+		 processor.  */
 	      if (stepping > 0xc)
 		break;
 	      /* Fall through.  */
 	    case INTEL_BIGCORE_SKYLAKE:
-		/* Disable Intel TSX and enable RTM_ALWAYS_ABORT for
-		   processors listed in:
-
-https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html
-		 */
-	    disable_tsx:
-		CPU_FEATURE_UNSET (cpu_features, HLE);
-		CPU_FEATURE_UNSET (cpu_features, RTM);
-		CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT);
-		break;
+	      /* Disable Intel TSX and enable RTM_ALWAYS_ABORT for
+		 processors listed in:
+
+		 https://www.intel.com/content/www/us/en/support/articles/000059422/processors.html
+	       */
+disable_tsx:
+	      CPU_FEATURE_UNSET (cpu_features, HLE);
+	      CPU_FEATURE_UNSET (cpu_features, RTM);
+	      CPU_FEATURE_SET (cpu_features, RTM_ALWAYS_ABORT);
+	      break;
 
 	    case INTEL_BIGCORE_HASWELL:
-		/* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working
-		   TSX.  Haswell also include other model numbers that have
-		   working TSX.  */
-		if (model == 0x3f && stepping >= 4)
+	      /* Xeon E7 v3 (model == 0x3f) with stepping >= 4 has working
+		 TSX.  Haswell also includes other model numbers that have
+		 working TSX.  */
+	      if (model == 0x3f && stepping >= 4)
 		break;
 
-		CPU_FEATURE_UNSET (cpu_features, RTM);
-		break;
+	      CPU_FEATURE_UNSET (cpu_features, RTM);
+	      break;
 	    }
 	}
+      else if (family == 19)
+	switch (model)
+	  {
+	  case 0x01:
+	    microarch = INTEL_BIGCORE_DIAMONDRAPIDS;
+	    break;
 
+	  default:
+	    break;
+	  }
+
+      switch (microarch)
+	{
+	  /* Atom / KNL tuning.  */
+	case INTEL_ATOM_BONNELL:
+	  /* BSF is slow on Bonnell.  */
+	  cpu_features->preferred[index_arch_Slow_BSF]
+	    |= bit_arch_Slow_BSF;
+	  break;
+
+	  /* Unaligned load versions are faster than SSSE3
+	     on Airmont, Silvermont, Goldmont, and Goldmont Plus.  */
+	case INTEL_ATOM_AIRMONT:
+	case INTEL_ATOM_SILVERMONT:
+	case INTEL_ATOM_GOLDMONT:
+	case INTEL_ATOM_GOLDMONT_PLUS:
+
+	  /* Knights Landing.  Enable Silvermont optimizations.  */
+	case INTEL_KNIGHTS_LANDING:
+
+	  cpu_features->preferred[index_arch_Fast_Unaligned_Load]
+	    |= (bit_arch_Fast_Unaligned_Load
+		| bit_arch_Fast_Unaligned_Copy
+		| bit_arch_Prefer_PMINUB_for_stringop
+		| bit_arch_Slow_SSE4_2);
+	  break;
+
+	case INTEL_ATOM_TREMONT:
+	  /* Enable rep string instructions, unaligned load, unaligned
+	     copy, pminub and avoid SSE 4.2 on Tremont.  */
+	  cpu_features->preferred[index_arch_Fast_Rep_String]
+	    |= (bit_arch_Fast_Rep_String
+		| bit_arch_Fast_Unaligned_Load
+		| bit_arch_Fast_Unaligned_Copy
+		| bit_arch_Prefer_PMINUB_for_stringop
+		| bit_arch_Slow_SSE4_2);
+	  break;
+
+	  /*
+	     Default tuned Knights microarch.
+	     case INTEL_KNIGHTS_MILL:
+	     */
+
+	  /*
+	     Default tuned atom microarch.
+	     case INTEL_ATOM_SIERRAFOREST:
+	     case INTEL_ATOM_GRANDRIDGE:
+	     case INTEL_ATOM_CLEARWATERFOREST:
+	     */
+
+	  /* Bigcore/Default Tuning.  */
+	default:
+	default_tuning:
+	  /* Unknown Intel processors.  Assume this is one of the Core
+	     i3/i5/i7 processors if AVX is available.  */
+	  if (!CPU_FEATURES_CPU_P (cpu_features, AVX))
+	    break;
+
+	enable_modern_features:
+	  /* Rep string instructions, unaligned load, unaligned copy,
+	     and pminub are fast on Intel Core i3, i5 and i7.  */
+	  cpu_features->preferred[index_arch_Fast_Rep_String]
+	    |= (bit_arch_Fast_Rep_String
+		| bit_arch_Fast_Unaligned_Load
+		| bit_arch_Fast_Unaligned_Copy
+		| bit_arch_Prefer_PMINUB_for_stringop);
+	  break;
+
+	case INTEL_BIGCORE_NEHALEM:
+	case INTEL_BIGCORE_WESTMERE:
+	  /* Older CPUs prefer non-temporal stores at lower threshold.  */
+	  cpu_features->cachesize_non_temporal_divisor = 8;
+	  goto enable_modern_features;
+
+	  /* Older Bigcore microarch (smaller non-temporal store
+	     threshold).  */
+	case INTEL_BIGCORE_SANDYBRIDGE:
+	case INTEL_BIGCORE_IVYBRIDGE:
+	case INTEL_BIGCORE_HASWELL:
+	case INTEL_BIGCORE_BROADWELL:
+	  cpu_features->cachesize_non_temporal_divisor = 8;
+	  goto default_tuning;
+
+	  /* Newer Bigcore microarch (larger non-temporal store
+	     threshold).  */
+	case INTEL_BIGCORE_SKYLAKE_AVX512:
+	case INTEL_BIGCORE_CANNONLAKE:
+	  /* Benchmarks indicate non-temporal memset is not
+	     necessarily profitable on SKX (and in some cases much
+	     worse). This is likely unique to SKX due to its unique
+	     mesh interconnect (not present on ICX or BWD). Disable
+	     non-temporal on all Skylake servers. */
+	  cpu_features->preferred[index_arch_Avoid_Non_Temporal_Memset]
+	    |= bit_arch_Avoid_Non_Temporal_Memset;
+	  /* fallthrough */
+	case INTEL_BIGCORE_COMETLAKE:
+	case INTEL_BIGCORE_SKYLAKE:
+	case INTEL_BIGCORE_KABYLAKE:
+	case INTEL_BIGCORE_ICELAKE:
+	case INTEL_BIGCORE_TIGERLAKE:
+	case INTEL_BIGCORE_ROCKETLAKE:
+	case INTEL_BIGCORE_RAPTORLAKE:
+	case INTEL_BIGCORE_METEORLAKE:
+	case INTEL_BIGCORE_LUNARLAKE:
+	case INTEL_BIGCORE_ARROWLAKE:
+	case INTEL_BIGCORE_PANTHERLAKE:
+	case INTEL_BIGCORE_SAPPHIRERAPIDS:
+	case INTEL_BIGCORE_EMERALDRAPIDS:
+	case INTEL_BIGCORE_GRANITERAPIDS:
+	case INTEL_BIGCORE_DIAMONDRAPIDS:
+	  /* Default tuned Mixed (bigcore + atom SOC). */
+	case INTEL_MIXED_LAKEFIELD:
+	case INTEL_MIXED_ALDERLAKE:
+	  cpu_features->cachesize_non_temporal_divisor = 2;
+	  goto default_tuning;
+	}
 
       /* Since AVX512ER is unique to Xeon Phi, set Prefer_No_VZEROUPPER
          if AVX512ER is available.  Don't use AVX512 to avoid lower CPU
@@ -1159,6 +1144,9 @@ no_cpuid:
 	       TUNABLE_CALLBACK (set_prefer_map_32bit_exec));
 #endif
 
+  /* Do not add the logic to disable XSAVE/XSAVEC if this glibc build
+     requires AVX and therefore XSAVE or XSAVEC support.  */
+#ifndef GCCMACRO__AVX__
   bool disable_xsave_features = false;
 
   if (!CPU_FEATURE_USABLE_P (cpu_features, OSXSAVE))
@@ -1212,6 +1200,7 @@ no_cpuid:
 
       CPU_FEATURE_UNSET (cpu_features, FMA4);
     }
+#endif
 
 #ifdef __x86_64__
   GLRO(dl_hwcap) = HWCAP_X86_64;
diff --git a/sysdeps/x86/cpu-tunables.c b/sysdeps/x86/cpu-tunables.c
index 342317680..d692e0e0d 100644
--- a/sysdeps/x86/cpu-tunables.c
+++ b/sysdeps/x86/cpu-tunables.c
@@ -164,6 +164,8 @@ TUNABLE_CALLBACK (set_hwcaps) (tunable_val_t *valp)
 		  /* Update xsave_state_size to XSAVE state size.  */
 		  cpu_features->xsave_state_size
 		    = cpu_features->xsave_state_full_size;
+		  _dl_x86_features_tlsdesc_state_size
+		    = cpu_features->xsave_state_full_size;
 		  CPU_FEATURE_UNSET (cpu_features, XSAVEC);
 		}
 	    }
diff --git a/sysdeps/x86/dl-diagnostics-cpu.c b/sysdeps/x86/dl-diagnostics-cpu.c
index 7d0373602..870b1268d 100644
--- a/sysdeps/x86/dl-diagnostics-cpu.c
+++ b/sysdeps/x86/dl-diagnostics-cpu.c
@@ -89,6 +89,8 @@ _dl_diagnostics_cpu (void)
                             cpu_features->xsave_state_size);
   print_cpu_features_value ("xsave_state_full_size",
                             cpu_features->xsave_state_full_size);
+  print_cpu_features_value ("tlsdesc_state_full_size",
+                            _dl_x86_features_tlsdesc_state_size);
   print_cpu_features_value ("data_cache_size", cpu_features->data_cache_size);
   print_cpu_features_value ("shared_cache_size",
                             cpu_features->shared_cache_size);
diff --git a/sysdeps/x86/include/cpu-features.h b/sysdeps/x86/include/cpu-features.h
index 9c485d38e..fbf1b8911 100644
--- a/sysdeps/x86/include/cpu-features.h
+++ b/sysdeps/x86/include/cpu-features.h
@@ -935,8 +935,6 @@ struct cpu_features
   /* The full state size for XSAVE when XSAVEC is disabled by
 
      GLIBC_TUNABLES=glibc.cpu.hwcaps=-XSAVEC
-
-     and the AMX state size when XSAVEC is available.
    */
   unsigned int xsave_state_full_size;
   /* Data cache size for use in memory and string routines, typically
@@ -990,6 +988,13 @@ extern const struct cpu_features *_dl_x86_get_cpu_features (void)
 
 #define __get_cpu_features() _dl_x86_get_cpu_features()
 
+#if IS_IN (rtld) || IS_IN (libc)
+/* XSAVE/XSAVEC state size used by TLS descriptors.  Compared to
+   xsave_state_size from struct cpu_features, this includes additional
+   registers.  */
+extern unsigned long int _dl_x86_features_tlsdesc_state_size attribute_hidden;
+#endif
+
 #if defined (_LIBC) && !IS_IN (nonlib)
 /* Unused for x86.  */
 # define INIT_ARCH()
diff --git a/sysdeps/x86/sysdep.h b/sysdeps/x86/sysdep.h
index 541393f1d..c3c73e75d 100644
--- a/sysdeps/x86/sysdep.h
+++ b/sysdeps/x86/sysdep.h
@@ -102,6 +102,9 @@
    | (1 << X86_XSTATE_ZMM_ID)		\
    | (1 << X86_XSTATE_APX_F_ID))
 
+/* The maximum supported xstate ID.  */
+# define X86_XSTATE_MAX_ID	X86_XSTATE_APX_F_ID
+
 /* AMX state mask.  */
 # define AMX_STATE_SAVE_MASK		\
   ((1 << X86_XSTATE_TILECFG_ID) | (1 << X86_XSTATE_TILEDATA_ID))
@@ -123,6 +126,9 @@
    | (1 << X86_XSTATE_K_ID)		\
    | (1 << X86_XSTATE_ZMM_H_ID))
 
+/* The maximum supported xstate ID.  */
+# define X86_XSTATE_MAX_ID	X86_XSTATE_ZMM_H_ID
+
 /* States to be included in xsave_state_size.  */
 # define FULL_STATE_SAVE_MASK		STATE_SAVE_MASK
 #endif
diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c
new file mode 100644
index 000000000..f0024c143
--- /dev/null
+++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsave.c
@@ -0,0 +1 @@
+#include <elf/tst-gnu2-tls2.c>
diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c
new file mode 100644
index 000000000..f0024c143
--- /dev/null
+++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavec.c
@@ -0,0 +1 @@
+#include <elf/tst-gnu2-tls2.c>
diff --git a/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c
new file mode 100644
index 000000000..f0024c143
--- /dev/null
+++ b/sysdeps/x86/tst-gnu2-tls2-x86-noxsavexsavec.c
@@ -0,0 +1 @@
+#include <elf/tst-gnu2-tls2.c>
diff --git a/sysdeps/x86_64/Makefile b/sysdeps/x86_64/Makefile
index 9d31685e0..5723ec184 100644
--- a/sysdeps/x86_64/Makefile
+++ b/sysdeps/x86_64/Makefile
@@ -142,7 +142,6 @@ CFLAGS-tst-avxmod.c += $(AVX-CFLAGS)
 AVX512-CFLAGS = -mavx512f
 CFLAGS-tst-audit10-aux.c += $(AVX512-CFLAGS)
 CFLAGS-tst-auditmod10a.c += $(AVX512-CFLAGS)
-CFLAGS-tst-auditmod10b.c += $(AVX512-CFLAGS)
 CFLAGS-tst-avx512-aux.c += $(AVX512-CFLAGS)
 CFLAGS-tst-avx512mod.c += $(AVX512-CFLAGS)
 
diff --git a/sysdeps/x86_64/dl-tlsdesc-dynamic.h b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
index 9965ddd2c..4f496de8c 100644
--- a/sysdeps/x86_64/dl-tlsdesc-dynamic.h
+++ b/sysdeps/x86_64/dl-tlsdesc-dynamic.h
@@ -99,7 +99,7 @@ _dl_tlsdesc_dynamic:
 # endif
 #else
 	/* Allocate stack space of the required size to save the state.  */
-	sub	_rtld_local_ro+RTLD_GLOBAL_RO_DL_X86_CPU_FEATURES_OFFSET+XSAVE_STATE_FULL_SIZE_OFFSET(%rip), %RSP_LP
+	sub	_dl_x86_features_tlsdesc_state_size(%rip), %RSP_LP
 #endif
 	/* Besides rdi and rsi, saved above, save rcx, rdx, r8, r9,
 	   r10 and r11.  */
diff --git a/sysdeps/x86_64/fpu/multiarch/Makefile b/sysdeps/x86_64/fpu/multiarch/Makefile
index e823d2fcc..340342244 100644
--- a/sysdeps/x86_64/fpu/multiarch/Makefile
+++ b/sysdeps/x86_64/fpu/multiarch/Makefile
@@ -1,15 +1,18 @@
 ifeq ($(subdir),math)
 CFLAGS-e_asin-fma.c = -mfma -mavx2
 CFLAGS-e_atan2-fma.c = -mfma -mavx2
+CFLAGS-e_atanh-fma.c = -mfma -mavx2
 CFLAGS-e_exp-fma.c = -mfma -mavx2
 CFLAGS-e_log-fma.c = -mfma -mavx2
 CFLAGS-e_log2-fma.c = -mfma -mavx2
 CFLAGS-e_pow-fma.c = -mfma -mavx2
+CFLAGS-e_sinh-fma.c = -mfma -mavx2
 CFLAGS-s_atan-fma.c = -mfma -mavx2
 CFLAGS-s_expm1-fma.c = -mfma -mavx2
 CFLAGS-s_log1p-fma.c = -mfma -mavx2
 CFLAGS-s_sin-fma.c = -mfma -mavx2
 CFLAGS-s_tan-fma.c = -mfma -mavx2
+CFLAGS-s_tanh-fma.c = -mfma -mavx2
 CFLAGS-s_sincos-fma.c = -mfma -mavx2
 CFLAGS-s_exp10m1f-fma.c = -mfma -mavx2
 CFLAGS-s_exp2m1f-fma.c = -mfma -mavx2
@@ -57,6 +60,7 @@ libm-sysdep_routines += \
   e_asin-fma \
   e_atan2-avx \
   e_atan2-fma \
+  e_atanh-fma \
   e_exp-avx \
   e_exp-fma \
   e_exp2f-fma \
@@ -68,6 +72,7 @@ libm-sysdep_routines += \
   e_logf-fma \
   e_pow-fma \
   e_powf-fma \
+  e_sinh-fma \
   s_atan-avx \
   s_atan-fma \
   s_ceil-sse4_1 \
@@ -96,6 +101,7 @@ libm-sysdep_routines += \
   s_sinf-sse2 \
   s_tan-avx \
   s_tan-fma \
+  s_tanh-fma \
   s_trunc-sse4_1 \
   s_truncf-sse4_1 \
 # libm-sysdep_routines
diff --git a/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c b/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c
new file mode 100644
index 000000000..c3f2f9e55
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/e_atanh-fma.c
@@ -0,0 +1,6 @@
+#define __ieee754_atanh __ieee754_atanh_fma
+#define __log1p __log1p_fma
+
+#define SECTION __attribute__ ((section (".text.fma")))
+
+#include <sysdeps/ieee754/dbl-64/e_atanh.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/e_atanh.c b/sysdeps/x86_64/fpu/multiarch/e_atanh.c
new file mode 100644
index 000000000..d2b785dfc
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/e_atanh.c
@@ -0,0 +1,34 @@
+/* Multiple versions of atanh.
+   Copyright (C) 2025 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdeps/x86/isa-level.h>
+#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL
+# include <libm-alias-finite.h>
+
+extern double __redirect_ieee754_atanh (double);
+
+# define SYMBOL_NAME ieee754_atanh
+# include "ifunc-fma.h"
+
+libc_ifunc_redirected (__redirect_ieee754_atanh, __ieee754_atanh, IFUNC_SELECTOR ());
+
+libm_alias_finite (__ieee754_atanh, __atanh)
+
+# define __ieee754_atanh __ieee754_atanh_sse2
+#endif
+#include <sysdeps/ieee754/dbl-64/e_atanh.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c b/sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c
new file mode 100644
index 000000000..e0e1e39a7
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/e_sinh-fma.c
@@ -0,0 +1,12 @@
+#define __ieee754_sinh __ieee754_sinh_fma
+#define __ieee754_exp __ieee754_exp_fma
+#define __expm1 __expm1_fma
+
+/* NB: __expm1 may be expanded to __expm1_fma in the following
+   prototypes.  */
+extern long double __expm1l (long double);
+extern long double __expm1f128 (long double);
+
+#define SECTION __attribute__ ((section (".text.fma")))
+
+#include <sysdeps/ieee754/dbl-64/e_sinh.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/e_sinh.c b/sysdeps/x86_64/fpu/multiarch/e_sinh.c
new file mode 100644
index 000000000..3d3c18ccd
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/e_sinh.c
@@ -0,0 +1,35 @@
+/* Multiple versions of sinh.
+   Copyright (C) 2025 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdeps/x86/isa-level.h>
+#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL
+# include <libm-alias-finite.h>
+
+extern double __redirect_ieee754_sinh (double);
+
+# define SYMBOL_NAME ieee754_sinh
+# include "ifunc-fma.h"
+
+libc_ifunc_redirected (__redirect_ieee754_sinh, __ieee754_sinh,
+		       IFUNC_SELECTOR ());
+
+libm_alias_finite (__ieee754_sinh, __sinh)
+
+# define __ieee754_sinh __ieee754_sinh_sse2
+#endif
+#include <sysdeps/ieee754/dbl-64/e_sinh.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c b/sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c
new file mode 100644
index 000000000..1b808b122
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_tanh-fma.c
@@ -0,0 +1,11 @@
+#define __tanh __tanh_fma
+#define __expm1 __expm1_fma
+
+/* NB: __expm1 may be expanded to __expm1_fma in the following
+   prototypes.  */
+extern long double __expm1l (long double);
+extern long double __expm1f128 (long double);
+
+#define SECTION __attribute__ ((section (".text.fma")))
+
+#include <sysdeps/ieee754/dbl-64/s_tanh.c>
diff --git a/sysdeps/x86_64/fpu/multiarch/s_tanh.c b/sysdeps/x86_64/fpu/multiarch/s_tanh.c
new file mode 100644
index 000000000..5539b6c61
--- /dev/null
+++ b/sysdeps/x86_64/fpu/multiarch/s_tanh.c
@@ -0,0 +1,31 @@
+/* Multiple versions of tanh.
+   Copyright (C) 2025 Free Software Foundation, Inc.
+   This file is part of the GNU C Library.
+
+   The GNU C Library is free software; you can redistribute it and/or
+   modify it under the terms of the GNU Lesser General Public
+   License as published by the Free Software Foundation; either
+   version 2.1 of the License, or (at your option) any later version.
+
+   The GNU C Library is distributed in the hope that it will be useful,
+   but WITHOUT ANY WARRANTY; without even the implied warranty of
+   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+   Lesser General Public License for more details.
+
+   You should have received a copy of the GNU Lesser General Public
+   License along with the GNU C Library; if not, see
+   <https://www.gnu.org/licenses/>.  */
+
+#include <sysdeps/x86/isa-level.h>
+#if MINIMUM_X86_ISA_LEVEL < AVX2_X86_ISA_LEVEL
+
+extern double __redirect_tanh (double);
+
+# define SYMBOL_NAME tanh
+# include "ifunc-fma.h"
+
+libc_ifunc_redirected (__redirect_tanh, __tanh, IFUNC_SELECTOR ());
+
+# define __tanh __tanh_sse2
+#endif
+#include <sysdeps/ieee754/dbl-64/s_tanh.c>
diff --git a/sysdeps/x86_64/multiarch/ifunc-impl-list.c b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
index a8349775d..c2dcadd1a 100644
--- a/sysdeps/x86_64/multiarch/ifunc-impl-list.c
+++ b/sysdeps/x86_64/multiarch/ifunc-impl-list.c
@@ -922,7 +922,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				     (CPU_FEATURE_USABLE (AVX2)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcsncpy_avx2)
-	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcpncpy,
+	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncpy,
 				     1,
 				     __wcsncpy_generic))
 
@@ -952,7 +952,7 @@ __libc_ifunc_impl_list (const char *name, struct libc_ifunc_impl *array,
 				     (CPU_FEATURE_USABLE (AVX2)
 				      && CPU_FEATURE_USABLE (BMI2)),
 				     __wcpncpy_avx2)
-	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcsncpy,
+	      X86_IFUNC_IMPL_ADD_V2 (array, i, wcpncpy,
 				     1,
 				     __wcpncpy_generic))
 
diff --git a/sysdeps/x86_64/tst-auditmod10b.c b/sysdeps/x86_64/tst-auditmod10b.c
index 6eb21b6f0..0b994ef0f 100644
--- a/sysdeps/x86_64/tst-auditmod10b.c
+++ b/sysdeps/x86_64/tst-auditmod10b.c
@@ -125,7 +125,6 @@ la_symbind64 (Elf64_Sym *sym, unsigned int ndx, uintptr_t *refcook,
 
 #include <tst-audit.h>
 
-#ifdef __AVX512F__
 #include <immintrin.h>
 #include <cpuid.h>
 
@@ -148,9 +147,37 @@ check_avx512 (void)
   return (eax & 0xe6) == 0xe6;
 }
 
-#else
-#include <emmintrin.h>
-#endif
+void
+__attribute__ ((target ("avx512f")))
+pltenter_avx512f (La_regs *regs, long int *framesizep)
+{
+  __m512i zero = _mm512_setzero_si512 ();
+  if (memcmp (&regs->lr_vector[0], &zero, sizeof (zero))
+      || memcmp (&regs->lr_vector[1], &zero, sizeof (zero))
+      || memcmp (&regs->lr_vector[2], &zero, sizeof (zero))
+      || memcmp (&regs->lr_vector[3], &zero, sizeof (zero))
+      || memcmp (&regs->lr_vector[4], &zero, sizeof (zero))
+      || memcmp (&regs->lr_vector[5], &zero, sizeof (zero))
+      || memcmp (&regs->lr_vector[6], &zero, sizeof (zero))
+      || memcmp (&regs->lr_vector[7], &zero, sizeof (zero)))
+    abort ();
+
+  for (int i = 0; i < 8; i++)
+    regs->lr_vector[i].zmm[0]
+      = (La_x86_64_zmm) _mm512_set1_epi64 (i + 1);
+
+  __m512i zmm = _mm512_set1_epi64 (-1);
+  asm volatile ("vmovdqa64 %0, %%zmm0" : : "x" (zmm) : "xmm0" );
+  asm volatile ("vmovdqa64 %0, %%zmm1" : : "x" (zmm) : "xmm1" );
+  asm volatile ("vmovdqa64 %0, %%zmm2" : : "x" (zmm) : "xmm2" );
+  asm volatile ("vmovdqa64 %0, %%zmm3" : : "x" (zmm) : "xmm3" );
+  asm volatile ("vmovdqa64 %0, %%zmm4" : : "x" (zmm) : "xmm4" );
+  asm volatile ("vmovdqa64 %0, %%zmm5" : : "x" (zmm) : "xmm5" );
+  asm volatile ("vmovdqa64 %0, %%zmm6" : : "x" (zmm) : "xmm6" );
+  asm volatile ("vmovdqa64 %0, %%zmm7" : : "x" (zmm) : "xmm7" );
+
+  *framesizep = 1024;
+}
 
 ElfW(Addr)
 pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
@@ -160,39 +187,33 @@ pltenter (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
   printf ("pltenter: symname=%s, st_value=%#lx, ndx=%u, flags=%u\n",
 	  symname, (long int) sym->st_value, ndx, *flags);
 
-#ifdef __AVX512F__
   if (check_avx512 () && strcmp (symname, "audit_test") == 0)
+    pltenter_avx512f (regs, framesizep);
+
+  return sym->st_value;
+}
+
+void
+__attribute__ ((target ("avx512f")))
+pltexit_avx512f (const La_regs *inregs, La_retval *outregs)
+{
+  __m512i zero = _mm512_setzero_si512 ();
+  if (memcmp (&outregs->lrv_vector0, &zero, sizeof (zero)))
+    abort ();
+
+  for (int i = 0; i < 8; i++)
     {
-      __m512i zero = _mm512_setzero_si512 ();
-      if (memcmp (&regs->lr_vector[0], &zero, sizeof (zero))
-	  || memcmp (&regs->lr_vector[1], &zero, sizeof (zero))
-	  || memcmp (&regs->lr_vector[2], &zero, sizeof (zero))
-	  || memcmp (&regs->lr_vector[3], &zero, sizeof (zero))
-	  || memcmp (&regs->lr_vector[4], &zero, sizeof (zero))
-	  || memcmp (&regs->lr_vector[5], &zero, sizeof (zero))
-	  || memcmp (&regs->lr_vector[6], &zero, sizeof (zero))
-	  || memcmp (&regs->lr_vector[7], &zero, sizeof (zero)))
-	abort ();
-
-      for (int i = 0; i < 8; i++)
-	regs->lr_vector[i].zmm[0]
-	  = (La_x86_64_zmm) _mm512_set1_epi64 (i + 1);
-
-      __m512i zmm = _mm512_set1_epi64 (-1);
-      asm volatile ("vmovdqa64 %0, %%zmm0" : : "x" (zmm) : "xmm0" );
-      asm volatile ("vmovdqa64 %0, %%zmm1" : : "x" (zmm) : "xmm1" );
-      asm volatile ("vmovdqa64 %0, %%zmm2" : : "x" (zmm) : "xmm2" );
-      asm volatile ("vmovdqa64 %0, %%zmm3" : : "x" (zmm) : "xmm3" );
-      asm volatile ("vmovdqa64 %0, %%zmm4" : : "x" (zmm) : "xmm4" );
-      asm volatile ("vmovdqa64 %0, %%zmm5" : : "x" (zmm) : "xmm5" );
-      asm volatile ("vmovdqa64 %0, %%zmm6" : : "x" (zmm) : "xmm6" );
-      asm volatile ("vmovdqa64 %0, %%zmm7" : : "x" (zmm) : "xmm7" );
-
-      *framesizep = 1024;
+      __m512i zmm = _mm512_set1_epi64 (i + 1);
+      if (memcmp (&inregs->lr_vector[i], &zmm, sizeof (zmm)) != 0)
+        abort ();
     }
-#endif
 
-  return sym->st_value;
+  outregs->lrv_vector0.zmm[0]
+    = (La_x86_64_zmm) _mm512_set1_epi64 (0x12349876);
+
+  __m512i zmm = _mm512_set1_epi64 (-1);
+  asm volatile ("vmovdqa64 %0, %%zmm0" : : "x" (zmm) : "xmm0" );
+  asm volatile ("vmovdqa64 %0, %%zmm1" : : "x" (zmm) : "xmm1" );
 }
 
 unsigned int
@@ -204,28 +225,8 @@ pltexit (ElfW(Sym) *sym, unsigned int ndx, uintptr_t *refcook,
 	  symname, (long int) sym->st_value, ndx,
 	  (ptrdiff_t) outregs->int_retval);
 
-#ifdef __AVX512F__
   if (check_avx512 () && strcmp (symname, "audit_test") == 0)
-    {
-      __m512i zero = _mm512_setzero_si512 ();
-      if (memcmp (&outregs->lrv_vector0, &zero, sizeof (zero)))
-	abort ();
-
-      for (int i = 0; i < 8; i++)
-	{
-	  __m512i zmm = _mm512_set1_epi64 (i + 1);
-	  if (memcmp (&inregs->lr_vector[i], &zmm, sizeof (zmm)) != 0)
-	    abort ();
-	}
-
-      outregs->lrv_vector0.zmm[0]
-	= (La_x86_64_zmm) _mm512_set1_epi64 (0x12349876);
-
-      __m512i zmm = _mm512_set1_epi64 (-1);
-      asm volatile ("vmovdqa64 %0, %%zmm0" : : "x" (zmm) : "xmm0" );
-      asm volatile ("vmovdqa64 %0, %%zmm1" : : "x" (zmm) : "xmm1" );
-    }
-#endif
+    pltexit_avx512f (inregs, outregs);
 
   return 0;
 }
-- 
2.30.2